From e9ab62006773031b6633854a8d42a92481fdedf6 Mon Sep 17 00:00:00 2001 From: lishengbao <563167901@qq.com> Date: Mon, 15 Aug 2022 20:12:47 +0800 Subject: [PATCH 01/21] add release infomation --- grimoire_elk/enriched/github.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 72908e2de..2145c18f3 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -647,6 +647,25 @@ def __get_rich_repo(self, item): rich_repo['fetched_on'] = repo['fetched_on'] rich_repo['url'] = repo['html_url'] + rich_releases = [] + for release in repo['releases'] : + rich_releases_dict = {} + rich_releases_dict['id'] = release['id'] + rich_releases_dict['tag_name'] = release['tag_name'] + rich_releases_dict['target_commitish'] = release['target_commitish'] + rich_releases_dict['prerelease'] = release['prerelease'] + rich_releases_dict['name'] = release['name'] + rich_releases_dict['body'] = release['body'] + rich_releases_dict['created_at'] = release['created_at'] + release_author = release['author'] + rich_releases_author_dict = {} + rich_releases_author_dict['login'] = release_author['login'] + rich_releases_author_dict['name'] = '' + rich_releases_dict['author'] = rich_releases_author_dict + rich_releases.append(rich_releases_dict) + rich_repo['releases'] = rich_releases + rich_repo['releases_count'] = len(rich_releases) + if self.prjs_map: rich_repo.update(self.get_item_project(rich_repo)) From 26befd4aca1b3f6f63036cfecd0997bd2cfee67d Mon Sep 17 00:00:00 2001 From: chenqi Date: Thu, 18 Aug 2022 19:48:44 +0800 Subject: [PATCH 02/21] add comments data without bot Signed-off-by: chenqi --- grimoire_elk/enriched/github.py | 65 ++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 72908e2de..e2f2698ac 100644 --- a/grimoire_elk/enriched/github.py +++ 
b/grimoire_elk/enriched/github.py @@ -203,6 +203,58 @@ def get_time_to_merge_request_response(self, item): return None + #get comments and exclude bot + def get_num_of_comments_without_bot(self, item): + """Get the num of comment was made to the issue by someone + other than the user who created the issue and bot + """ + comments = [comment for comment in item['comments_data'] + if item['user']['login'] != comment['user']['login'] \ + and 'bot' not in comment['user']['login']] + return len(comments) + + def get_num_of_reviews_without_bot(self, item): + """Get the num of comment was made to the issue by someone + other than the user who created the issue and bot + """ + review_comments = [] + for comment in item['review_comments_data']: + # skip comments of ghost users + if not comment['user']: + continue + + # skip comments of the pull request creator + if 'bot' in comment['user']['login'] or item['user']['login'] == comment['user']['login'] : + continue + + review_comments.append(comment) + + return len(review_comments) + + #get first attendtion without bot + def get_time_to_first_attention_without_bot(self, item): + """Get the first date at which a comment was made to the issue by someone + other than the user who created the issue and bot + """ + comment_dates = [str_to_datetime(comment['created_at']) for comment in item['comments_data'] + if item['user']['login'] != comment['user']['login'] and 'bot' not in comment['user']['login']] + if comment_dates: + return min(comment_dates) + return None + + #get first attendtion without bot + def get_time_to_first_review_attention_without_bot(self, item): + """Get the first date at which a comment was made to the issue by someone + other than the user who created the issue and bot + """ + if 'review_comments_data' in item: + comment_dates = [str_to_datetime(comment['created_at']) for comment in item['review_comments_data'] + if 'login' in item['user'] and 'login' in comment['user'] and item['user']['login'] != 
comment['user']['login'] and 'bot' not in comment['user']['login']] + if comment_dates: + return min(comment_dates) + else: + return None + def get_latest_comment_date(self, item): """Get the date of the latest comment on the issue/pr""" @@ -524,6 +576,11 @@ def __get_rich_pull(self, item): min_review_date = self.get_time_to_merge_request_response(pull_request) rich_pr['time_to_merge_request_response'] = \ get_time_diff_days(str_to_datetime(pull_request['created_at']), min_review_date) + rich_pr['num_review_comments_without_bot'] = \ + self.get_num_of_reviews_without_bot(pull_request) + rich_pr['time_to_first_attention_without_bot'] = \ + get_time_diff_days(str_to_datetime(pull_request['created_at']), + self.get_time_to_first_review_attention_without_bot(pull_request)) if self.prjs_map: rich_pr.update(self.get_item_project(rich_pr)) @@ -622,11 +679,17 @@ def __get_rich_issue(self, item): rich_issue['project'] = item['project'] rich_issue['time_to_first_attention'] = None + rich_issue['num_of_comments_without_bot'] = None if issue['comments'] + issue['reactions']['total_count'] != 0: rich_issue['time_to_first_attention'] = \ get_time_diff_days(str_to_datetime(issue['created_at']), self.get_time_to_first_attention(issue)) - + rich_issue['num_of_comments_without_bot'] = \ + self.get_num_of_comments_without_bot(issue) + rich_issue['time_to_first_attention_without_bot'] = \ + get_time_diff_days(str_to_datetime(issue['created_at']), + self.get_time_to_first_attention_without_bot(issue)) + rich_issue.update(self.get_grimoire_fields(issue['created_at'], "issue")) item[self.get_field_date()] = rich_issue[self.get_field_date()] From 9d4fb8b1492b9426047d8850217b0dca257c085c Mon Sep 17 00:00:00 2001 From: chenqi Date: Thu, 18 Aug 2022 19:50:52 +0800 Subject: [PATCH 03/21] add pr-linked-issue message Signed-off-by: chenqi --- grimoire_elk/enriched/github.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py 
index e2f2698ac..12c2d4396 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -582,6 +582,9 @@ def __get_rich_pull(self, item): get_time_diff_days(str_to_datetime(pull_request['created_at']), self.get_time_to_first_review_attention_without_bot(pull_request)) + if 'linked_issues_data' in pull_request: + rich_pr['linked_issues_count'] = pull_request['linked_issues_data'] + if self.prjs_map: rich_pr.update(self.get_item_project(rich_pr)) From 54cc2660c765cbd5a6844a89617054af11fc68c0 Mon Sep 17 00:00:00 2001 From: chenqi Date: Thu, 18 Aug 2022 23:44:41 +0800 Subject: [PATCH 04/21] debug for review_data is None --- grimoire_elk/enriched/github.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 12c2d4396..994675c36 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -247,9 +247,9 @@ def get_time_to_first_review_attention_without_bot(self, item): """Get the first date at which a comment was made to the issue by someone other than the user who created the issue and bot """ - if 'review_comments_data' in item: + if 'review_comments_data' in item and item['review_comments_data']: comment_dates = [str_to_datetime(comment['created_at']) for comment in item['review_comments_data'] - if 'login' in item['user'] and 'login' in comment['user'] and item['user']['login'] != comment['user']['login'] and 'bot' not in comment['user']['login']] + if 'login' in item['user'] and comment['user'] and item['user']['login'] != comment['user']['login'] and 'bot' not in comment['user']['login']] if comment_dates: return min(comment_dates) else: From 7e00e646490793a6dfe45af9b75d3ef035773de7 Mon Sep 17 00:00:00 2001 From: lishengbao <563167901@qq.com> Date: Wed, 31 Aug 2022 09:42:26 +0800 Subject: [PATCH 05/21] fix release bug --- grimoire_elk/enriched/github.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 
deletions(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 6641e8452..589a080d9 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -714,21 +714,23 @@ def __get_rich_repo(self, item): rich_repo['url'] = repo['html_url'] rich_releases = [] - for release in repo['releases'] : - rich_releases_dict = {} - rich_releases_dict['id'] = release['id'] - rich_releases_dict['tag_name'] = release['tag_name'] - rich_releases_dict['target_commitish'] = release['target_commitish'] - rich_releases_dict['prerelease'] = release['prerelease'] - rich_releases_dict['name'] = release['name'] - rich_releases_dict['body'] = release['body'] - rich_releases_dict['created_at'] = release['created_at'] - release_author = release['author'] - rich_releases_author_dict = {} - rich_releases_author_dict['login'] = release_author['login'] - rich_releases_author_dict['name'] = '' - rich_releases_dict['author'] = rich_releases_author_dict - rich_releases.append(rich_releases_dict) + releases = repo.get('releases') + if releases: + for release in releases: + rich_releases_dict = {} + rich_releases_dict['id'] = release['id'] + rich_releases_dict['tag_name'] = release['tag_name'] + rich_releases_dict['target_commitish'] = release['target_commitish'] + rich_releases_dict['prerelease'] = release['prerelease'] + rich_releases_dict['name'] = release['name'] + rich_releases_dict['body'] = release['body'] + rich_releases_dict['created_at'] = release['created_at'] + release_author = release['author'] + rich_releases_author_dict = {} + rich_releases_author_dict['login'] = release_author['login'] + rich_releases_author_dict['name'] = '' + rich_releases_dict['author'] = rich_releases_author_dict + rich_releases.append(rich_releases_dict) rich_repo['releases'] = rich_releases rich_repo['releases_count'] = len(rich_releases) From 77d40995ab556f548d3d964ddf37aa75367c4bed Mon Sep 17 00:00:00 2001 From: Chenqi Shan Date: Mon, 19 Sep 2022 17:27:36 
+0800 Subject: [PATCH 06/21] support opensearch 2.X and add commits_data for pr Signed-off-by: Chenqi Shan --- grimoire_elk/elastic.py | 2 ++ grimoire_elk/enriched/github.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/grimoire_elk/elastic.py b/grimoire_elk/elastic.py index 87a502c0b..d5fd932b8 100644 --- a/grimoire_elk/elastic.py +++ b/grimoire_elk/elastic.py @@ -299,6 +299,7 @@ def get_bulk_url(self): """Get the bulk URL endpoint""" if (self.major == '7' and self.distribution == 'elasticsearch') or \ + (self.major == '2' and self.distribution == 'opensearch') or \ (self.major == '1' and self.distribution == 'opensearch'): bulk_url = self.index_url + '/_bulk' else: @@ -312,6 +313,7 @@ def get_mapping_url(self, _type=None): :param _type: type of the mapping. In case of ES7, it is None """ if (self.major == '7' and self.distribution == 'elasticsearch') or \ + (self.major == '2' and self.distribution == 'opensearch') or \ (self.major == '1' and self.distribution == 'opensearch'): mapping_url = self.index_url + "/_mapping" else: diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 6641e8452..0591e4847 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -584,6 +584,7 @@ def __get_rich_pull(self, item): if 'linked_issues_data' in pull_request: rich_pr['linked_issues_count'] = pull_request['linked_issues_data'] + rich_pr['commits_data'] = pull_request['commits_data'] if self.prjs_map: rich_pr.update(self.get_item_project(rich_pr)) @@ -674,6 +675,7 @@ def __get_rich_issue(self, item): rich_issue['github_repo'] = rich_issue['repository'].replace(GITHUB, '') rich_issue['github_repo'] = re.sub('.git$', '', rich_issue['github_repo']) rich_issue["url_id"] = rich_issue['github_repo'] + "/issues/" + rich_issue['id_in_repo'] + rich_issue['body'] = issue['body'] if self.prjs_map: rich_issue.update(self.get_item_project(rich_issue)) From 936eb7c73f56fae66f1535d02d651f7945a4f074 Mon Sep 17 00:00:00 2001 From: 
lishengbao Date: Thu, 15 Dec 2022 16:39:24 +0800 Subject: [PATCH 07/21] raw phase add release mapping Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 2 +- grimoire_elk/raw/github.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 3f4204b52..42e61b75d 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -725,7 +725,7 @@ def __get_rich_repo(self, item): rich_releases_dict['target_commitish'] = release['target_commitish'] rich_releases_dict['prerelease'] = release['prerelease'] rich_releases_dict['name'] = release['name'] - rich_releases_dict['body'] = release['body'] + rich_releases_dict['body'] = '' rich_releases_dict['created_at'] = release['created_at'] release_author = release['author'] rich_releases_author_dict = {} diff --git a/grimoire_elk/raw/github.py b/grimoire_elk/raw/github.py index 687702bec..3f4a97598 100644 --- a/grimoire_elk/raw/github.py +++ b/grimoire_elk/raw/github.py @@ -74,6 +74,15 @@ def get_elastic_mappings(es_major): "body": { "type": "text", "index": true + }, + "releases": { + "dynamic":false, + "properties": { + "body": { + "type": "text", + "index": true + } + } } } } From 57f7eafd2405f0b2225ba6730de211b204bfbad5 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Fri, 20 Jan 2023 14:19:22 +0800 Subject: [PATCH 08/21] Add 'email' field Signed-off-by: lishengbao --- grimoire_elk/enriched/git.py | 1 + grimoire_elk/enriched/github.py | 4 ++++ grimoire_elk/enriched/github2.py | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index 9d116184b..044baacef 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -293,6 +293,7 @@ def get_rich_item(self, item): # author_name and author_domain are added always identity = self.get_sh_identity(commit["Author"]) eitem["author_name"] = identity['name'] + eitem["author_email"] 
= identity['email'] eitem["author_domain"] = self.get_identity_domain(identity) # committer data diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 42e61b75d..1df7ddcbe 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -509,6 +509,7 @@ def __get_rich_pull(self, item): if self.__has_user(user): rich_pr['user_name'] = user['name'] rich_pr['author_name'] = user['name'] + rich_pr['user_email'] = user.get('email', None) rich_pr["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_pr['user_org'] = user['company'] rich_pr['user_location'] = user['location'] @@ -520,6 +521,7 @@ def __get_rich_pull(self, item): rich_pr['user_location'] = None rich_pr['user_geolocation'] = None rich_pr['author_name'] = None + rich_pr['user_email'] = None merged_by = pull_request.get('merged_by_data', None) if merged_by and merged_by is not None: @@ -621,6 +623,7 @@ def __get_rich_issue(self, item): if self.__has_user(user): rich_issue['user_name'] = user['name'] rich_issue['author_name'] = user['name'] + rich_issue['user_email'] = user.get('email', None) rich_issue["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_issue['user_org'] = user['company'] rich_issue['user_location'] = user['location'] @@ -632,6 +635,7 @@ def __get_rich_issue(self, item): rich_issue['user_location'] = None rich_issue['user_geolocation'] = None rich_issue['author_name'] = None + rich_issue['user_email'] = None assignee = issue.get('assignee_data', None) if self.__has_user(assignee): diff --git a/grimoire_elk/enriched/github2.py b/grimoire_elk/enriched/github2.py index 61e1698d6..7b01d66bf 100644 --- a/grimoire_elk/enriched/github2.py +++ b/grimoire_elk/enriched/github2.py @@ -510,6 +510,7 @@ def __get_rich_pull(self, item): if user is not None and user: rich_pr['user_name'] = user['name'] rich_pr['author_name'] = user['name'] + rich_pr['user_email'] = user.get('email', None) 
rich_pr["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_pr['user_org'] = user['company'] rich_pr['user_location'] = user['location'] @@ -521,6 +522,7 @@ def __get_rich_pull(self, item): rich_pr['user_location'] = None rich_pr['user_geolocation'] = None rich_pr['author_name'] = None + rich_pr['user_email'] = None merged_by = pull_request.get('merged_by_data', None) if merged_by and merged_by != USER_NOT_AVAILABLE: @@ -618,6 +620,7 @@ def __get_rich_issue(self, item): if user is not None and user: rich_issue['user_name'] = user['name'] rich_issue['author_name'] = user['name'] + rich_issue['user_email'] = user.get('email', None) rich_issue["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_issue['user_org'] = user['company'] rich_issue['user_location'] = user['location'] @@ -629,6 +632,7 @@ def __get_rich_issue(self, item): rich_issue['user_location'] = None rich_issue['user_geolocation'] = None rich_issue['author_name'] = None + rich_issue['user_email'] = None assignee = issue.get('assignee_data', None) if assignee and assignee != USER_NOT_AVAILABLE: From 7c27dd833a98f91c478ac629e5cd5e9da1474902 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Tue, 6 Jun 2023 09:37:57 +0800 Subject: [PATCH 09/21] Fix the bug of reaction['user'] being NoneType in issues Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 1df7ddcbe..1a244725b 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -176,7 +176,11 @@ def get_time_to_first_attention(self, item): comment_dates = [str_to_datetime(comment['created_at']) for comment in item['comments_data'] if item['user']['login'] != comment['user']['login']] reaction_dates = [str_to_datetime(reaction['created_at']) for reaction in item['reactions_data'] - if item['user']['login'] != 
reaction['user']['login']] + if item.get('user') is not None and + item['user'].get('login') is not None and + reaction.get('user') is not None and + reaction['user'].get('login') is not None and + item['user']['login'] != reaction['user']['login']] reaction_dates.extend(comment_dates) if reaction_dates: return min(reaction_dates) From 6038493836616249fa9ed3035feb7a3130282243 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Thu, 17 Aug 2023 16:19:37 +0800 Subject: [PATCH 10/21] Add more event types Signed-off-by: lishengbao --- grimoire_elk/enriched/githubql.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index 8e6947e36..993f97f01 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -37,6 +37,13 @@ CLOSED_EVENTS = ['ClosedEvent'] MERGED_EVENTS = ['MergedEvent'] PULL_REQUEST_REVIEW_EVENTS = ['PullRequestReview'] +REOPENED_EVENT = ['ReopenedEvent'] +ASSIGNED_EVENT = ['AssignedEvent'] +LOCKED_EVENT = ['LockedEvent'] +MILESTONED_EVENT = ['MilestonedEvent'] +MARKED_AS_DUPLICATE_EVENT = ['MarkedAsDuplicateEvent'] +TRANSFERRED_EVENT = ['TransferredEvent'] + logger = logging.getLogger(__name__) @@ -171,6 +178,7 @@ def __get_rich_event(self, item): rich_event['event_type'] = event['eventType'] rich_event['created_at'] = event['createdAt'] rich_event['actor_username'] = actor['login'] if actor else None + rich_event['user_login'] = rich_event['actor_username'] rich_event['repository'] = self.get_project_repository(rich_event) rich_event['pull_request'] = True rich_event['item_type'] = 'pull request' @@ -276,6 +284,25 @@ def __get_rich_event(self, item): rich_event['merge_updated_at'] = review['updatedAt'] rich_event['merge_url'] = review['url'] item['data']['actor'] = item['data']['author'] + elif rich_event['event_type'] in ASSIGNED_EVENT: + assignee_usernames = [ edge['node']['login'] for edge in 
event['assignable']['assignees']['edges']] + rich_event['assignee_usernames'] = assignee_usernames + elif rich_event['event_type'] in MARKED_AS_DUPLICATE_EVENT: + duplicate = event['canonical'] + rich_event['duplicate_cross_repo'] = event['isCrossRepository'] + rich_event['duplicate_number'] = duplicate['number'] + rich_event['duplicate_url'] = duplicate['url'] + rich_event['duplicate_repo'] = '/'.join(duplicate['url'].replace(GITHUB, '').split('/')[:-2]) + rich_event['duplicate_created_at'] = duplicate['createdAt'] + rich_event['duplicate_updated_at'] = duplicate['updatedAt'] + rich_event['duplicate_closed_at'] = duplicate['closedAt'] + rich_event['duplicate_closed'] = duplicate['closed'] + rich_event['duplicate_merged'] = duplicate.get('merged', None) + elif rich_event['event_type'] in TRANSFERRED_EVENT: + from_repository = event['fromRepository'] + rich_event['from_repo_id'] = from_repository['id'] + rich_event['from_repo_url'] = from_repository['url'] + rich_event['from_repo'] = from_repository['url'].replace(GITHUB, '') else: logger.warning("[github] event {} not processed".format(rich_event['event_type'])) From eed43d3c9ebccba356f0a49ae9878bc9b28ca39c Mon Sep 17 00:00:00 2001 From: lishengbao Date: Thu, 17 Aug 2023 16:20:24 +0800 Subject: [PATCH 11/21] Add stargazer and fork category to the githubql.py Signed-off-by: lishengbao --- grimoire_elk/enriched/githubql.py | 51 ++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index 993f97f01..a92cc333f 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -154,7 +154,16 @@ def get_project_repository(self, eitem): @metadata def get_rich_item(self, item): - rich_item = self.__get_rich_event(item) + rich_item = {} + if item['category'] == 'event': + rich_item = self.__get_rich_event(item) + elif item['category'] == 'stargazer': + rich_item = self.__get_rich_stargazer(item) 
+ elif item['category'] == 'fork': + rich_item = self.__get_rich_fork(item) + else: + logger.error("[githubql] rich item not defined for GitHubQL category {}".format( + item['category'])) self.add_repository_labels(rich_item) self.add_metadata_filter_raw(rich_item) @@ -327,6 +336,46 @@ def __get_rich_event(self, item): return rich_event + def __get_rich_stargazer(self, item): + rich_stargazer = {} + + self.copy_raw_fields(self.RAW_FIELDS_COPY, item, rich_stargazer) + + stargazer = item['data'] + rich_stargazer['user_login'] = stargazer['login'] + rich_stargazer['user_id'] = stargazer['id'] + rich_stargazer['user_name'] = stargazer.get('name', None) + rich_stargazer['auhtor_name'] = stargazer.get('name', None) + rich_stargazer['user_email'] = stargazer.get('email', None) + rich_stargazer['user_company'] = stargazer.get('company', None) + rich_stargazer['created_at'] = stargazer['createdAt'] + + if self.prjs_map: + rich_stargazer.update(self.get_item_project(rich_stargazer)) + + rich_stargazer.update(self.get_grimoire_fields(stargazer['createdAt'], "stargazer")) + + return rich_stargazer + + def __get_rich_fork(self, item): + rich_fork = {} + + self.copy_raw_fields(self.RAW_FIELDS_COPY, item, rich_fork) + + fork = item['data'] + rich_fork['frok_id'] = fork['id'] + rich_fork['frok_url'] = fork['url'] + rich_fork['user_login'] = fork['owner']['login'] + rich_fork['user_id'] = fork['owner']['id'] + rich_fork['created_at'] = fork['createdAt'] + + if self.prjs_map: + rich_fork.update(self.get_item_project(rich_fork)) + + rich_fork.update(self.get_grimoire_fields(fork['createdAt'], "fork")) + + return rich_fork + def enrich_duration_analysis(self, ocean_backend, enrich_backend, start_event_type, target_attr, fltr_event_types, fltr_attr=None, page_size=200): """The purpose of this study is to calculate the duration between two GitHub events. 
It requires From 001c16e7c91755eae22de4458f2db8ef81c8862b Mon Sep 17 00:00:00 2001 From: lishengbao Date: Sat, 26 Aug 2023 17:49:30 +0800 Subject: [PATCH 12/21] Add archived_at time in github:repo Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 1a244725b..3278d0dd1 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -722,6 +722,11 @@ def __get_rich_repo(self, item): rich_repo['stargazers_count'] = repo['stargazers_count'] rich_repo['fetched_on'] = repo['fetched_on'] rich_repo['url'] = repo['html_url'] + rich_repo['archived'] = repo['archived'] + rich_repo['archivedAt'] = repo.get('archivedAt') + rich_repo['created_at'] = repo['created_at'] + rich_repo['updated_at'] = repo['updated_at'] + rich_releases = [] releases = repo.get('releases') From f021f547d0de23dab819266164c59203bb245e96 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Sat, 26 Aug 2023 18:01:13 +0800 Subject: [PATCH 13/21] Fix release author may be None Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 3278d0dd1..35d614379 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -742,7 +742,10 @@ def __get_rich_repo(self, item): rich_releases_dict['created_at'] = release['created_at'] release_author = release['author'] rich_releases_author_dict = {} - rich_releases_author_dict['login'] = release_author['login'] + if release_author is None: + rich_releases_author_dict['login'] = '' + else: + rich_releases_author_dict['login'] = release_author['login'] rich_releases_author_dict['name'] = '' rich_releases_dict['author'] = rich_releases_author_dict rich_releases.append(rich_releases_dict) From d770408c69d4fff394ee1f72e05efac068cb0cb7 Mon Sep 17 00:00:00 
2001 From: lishengbao Date: Mon, 25 Sep 2023 09:59:22 +0800 Subject: [PATCH 14/21] Adding the 'committer_email' and 'parents' fields to git Signed-off-by: lishengbao --- grimoire_elk/enriched/git.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index 044baacef..d7a926101 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -229,7 +229,8 @@ def get_rich_item(self, item): if 'refs' in commit: eitem["commit_tags"] = list(filter(lambda r: "tag: " in r, commit['refs'])) - + + eitem['parents'] = commit['parents'] eitem['hash_short'] = eitem['hash'][0:6] # Enrich dates author_date = str_to_datetime(commit["AuthorDate"]) @@ -299,6 +300,7 @@ def get_rich_item(self, item): # committer data identity = self.get_sh_identity(commit["Commit"]) eitem["committer_name"] = identity['name'] + eitem["committer_email"] = identity['email'] eitem["committer_domain"] = self.get_identity_domain(identity) # title from first line From aa395c30f5b4a3f076c526cd483ddad961f61022 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Mon, 25 Sep 2023 10:00:43 +0800 Subject: [PATCH 15/21] Adding the 'assignees_login', 'requested_reviewers_login', 'reviewers_login' and 'merge_commit_sha' fields to github Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 35d614379..a004ccda4 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -542,6 +542,15 @@ def __get_rich_pull(self, item): rich_pr['merge_author_org'] = None rich_pr['merge_author_location'] = None rich_pr['merge_author_geolocation'] = None + assignees_login = set() + [assignees_login.add(assignee.get('login')) for assignee in pull_request['assignees'] if 'assignees' in pull_request] + rich_pr['assignees_login'] = list(assignees_login) + requested_reviewers_login = set() 
+ [requested_reviewers_login.add(requested_reviewer.get('login')) for requested_reviewer in pull_request['requested_reviewers'] if 'requested_reviewers' in pull_request] + rich_pr['requested_reviewers_login'] = list(requested_reviewers_login) + reviewers_login = set() + [reviewers_login.add(reviewer.get('user').get('login')) for reviewer in pull_request['reviews_data'] if 'reviews_data' in pull_request] + rich_pr['reviewers_login'] = list(reviewers_login) rich_pr['id'] = pull_request['id'] rich_pr['id_in_repo'] = pull_request['html_url'].split("/")[-1] @@ -558,6 +567,7 @@ def __get_rich_pull(self, item): rich_pr['additions'] = pull_request['additions'] rich_pr['deletions'] = pull_request['deletions'] rich_pr['changed_files'] = pull_request['changed_files'] + rich_pr['merge_commit_sha'] = pull_request['merge_commit_sha'] # Adding this field for consistency with the rest of github-related enrichers rich_pr['issue_url'] = pull_request['html_url'] labels = [] From 9885f227841e7dddbdbb34eb74ee9d6b9ad5ac67 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Thu, 26 Oct 2023 17:10:26 +0800 Subject: [PATCH 16/21] Remove LockedEvent Signed-off-by: lishengbao --- grimoire_elk/enriched/githubql.py | 1 - 1 file changed, 1 deletion(-) diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index a92cc333f..28a650a24 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -39,7 +39,6 @@ PULL_REQUEST_REVIEW_EVENTS = ['PullRequestReview'] REOPENED_EVENT = ['ReopenedEvent'] ASSIGNED_EVENT = ['AssignedEvent'] -LOCKED_EVENT = ['LockedEvent'] MILESTONED_EVENT = ['MilestonedEvent'] MARKED_AS_DUPLICATE_EVENT = ['MarkedAsDuplicateEvent'] TRANSFERRED_EVENT = ['TransferredEvent'] From 9fc15b1f71e7ec339e661172a46ccb40a479594f Mon Sep 17 00:00:00 2001 From: lishengbao Date: Thu, 26 Oct 2023 17:13:01 +0800 Subject: [PATCH 17/21] Add UnassignedEvent, DemilestonedEvent, RenameTitleEvent Signed-off-by: lishengbao --- 
grimoire_elk/enriched/githubql.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index 28a650a24..200386264 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -38,10 +38,11 @@ MERGED_EVENTS = ['MergedEvent'] PULL_REQUEST_REVIEW_EVENTS = ['PullRequestReview'] REOPENED_EVENT = ['ReopenedEvent'] -ASSIGNED_EVENT = ['AssignedEvent'] -MILESTONED_EVENT = ['MilestonedEvent'] +ASSIGNED_EVENT = ['AssignedEvent', 'UnassignedEvent'] +MILESTONED_EVENT = ['MilestonedEvent', 'DemilestonedEvent'] MARKED_AS_DUPLICATE_EVENT = ['MarkedAsDuplicateEvent'] TRANSFERRED_EVENT = ['TransferredEvent'] +TITLE_EVENT = ['RenamedTitleEvent'] logger = logging.getLogger(__name__) @@ -293,8 +294,7 @@ def __get_rich_event(self, item): rich_event['merge_url'] = review['url'] item['data']['actor'] = item['data']['author'] elif rich_event['event_type'] in ASSIGNED_EVENT: - assignee_usernames = [ edge['node']['login'] for edge in event['assignable']['assignees']['edges']] - rich_event['assignee_usernames'] = assignee_usernames + rich_event['assignee_login'] = event.get("assignee", None).get("login", None) elif rich_event['event_type'] in MARKED_AS_DUPLICATE_EVENT: duplicate = event['canonical'] rich_event['duplicate_cross_repo'] = event['isCrossRepository'] @@ -311,6 +311,9 @@ def __get_rich_event(self, item): rich_event['from_repo_id'] = from_repository['id'] rich_event['from_repo_url'] = from_repository['url'] rich_event['from_repo'] = from_repository['url'].replace(GITHUB, '') + elif rich_event['event_type'] in TITLE_EVENT: + rich_event['current_title'] = event['currentTitle'] + rich_event['previous_title'] = event['previousTitle'] else: logger.warning("[github] event {} not processed".format(rich_event['event_type'])) From c42737759a9e43e44787ea7909719db3385ca977 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Thu, 30 Nov 2023 12:34:14 +0800 Subject: [PATCH 
18/21] Fix NoneType bug Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 11 ++++++++--- grimoire_elk/enriched/githubql.py | 4 +++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index a004ccda4..34d026a08 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -543,13 +543,18 @@ def __get_rich_pull(self, item): rich_pr['merge_author_location'] = None rich_pr['merge_author_geolocation'] = None assignees_login = set() - [assignees_login.add(assignee.get('login')) for assignee in pull_request['assignees'] if 'assignees' in pull_request] + for assignee in pull_request.get('assignees', []): + assignees_login.add(assignee.get('login')) rich_pr['assignees_login'] = list(assignees_login) requested_reviewers_login = set() - [requested_reviewers_login.add(requested_reviewer.get('login')) for requested_reviewer in pull_request['requested_reviewers'] if 'requested_reviewers' in pull_request] + for requested_reviewer in pull_request.get('requested_reviewers', []): + requested_reviewers_login.add(requested_reviewer.get('login')) rich_pr['requested_reviewers_login'] = list(requested_reviewers_login) reviewers_login = set() - [reviewers_login.add(reviewer.get('user').get('login')) for reviewer in pull_request['reviews_data'] if 'reviews_data' in pull_request] + for reviewer in pull_request.get('reviews_data', []): + reviewer_user = reviewer.get('user') + if reviewer_user is not None: + reviewers_login.add(reviewer_user.get('login')) rich_pr['reviewers_login'] = list(reviewers_login) rich_pr['id'] = pull_request['id'] diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index 200386264..cbbfa0ab4 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -294,7 +294,9 @@ def __get_rich_event(self, item): rich_event['merge_url'] = review['url'] item['data']['actor'] = item['data']['author'] elif 
rich_event['event_type'] in ASSIGNED_EVENT: - rich_event['assignee_login'] = event.get("assignee", None).get("login", None) + event_assignee = event.get("assignee") + if event_assignee is not None: + rich_event['assignee_login'] = event_assignee.get("login", None) elif rich_event['event_type'] in MARKED_AS_DUPLICATE_EVENT: duplicate = event['canonical'] rich_event['duplicate_cross_repo'] = event['isCrossRepository'] From 2cb947d4a801abc09ccd7dfac6fd06119b31d061 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Wed, 6 Mar 2024 16:34:45 +0800 Subject: [PATCH 19/21] Add the 'topics' field to the repo stage Signed-off-by: lishengbao --- grimoire_elk/enriched/github.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 34d026a08..7fde5c8cd 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -767,6 +767,8 @@ def __get_rich_repo(self, item): rich_repo['releases'] = rich_releases rich_repo['releases_count'] = len(rich_releases) + rich_repo['topics'] = repo.get("topics", []) + if self.prjs_map: rich_repo.update(self.get_item_project(rich_repo)) From 364eaaf605c98c5088068faa789ac02fd9d50494 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Wed, 15 May 2024 16:13:34 +0800 Subject: [PATCH 20/21] Fix git branch bug Signed-off-by: lishengbao --- grimoire_elk/enriched/git.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index d7a926101..07a0ead45 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -1043,6 +1043,10 @@ def add_commit_branches(self, git_repo, enrich_backend): if commit_count: self.__process_commits_in_branch(enrich_backend, git_repo.uri, branch_name, to_process) + # reset the counter + to_process = [] + commit_count = 0 + except Exception as e: logger.error("[git] Skip adding branch info for repo {} due to {}".format(git_repo.uri, e)) return From 
25dff55bc0772712f47fe27c88f48c3b96d8a856 Mon Sep 17 00:00:00 2001 From: lishengbao Date: Tue, 17 Dec 2024 14:35:00 +0800 Subject: [PATCH 21/21] [enriched-git] Handle connection problems during enrich_git_branches This code handles connection problems when accessing the repo to delete its branch information: the failure is logged and that repo is skipped. Signed-off-by: lishengbao --- grimoire_elk/enriched/git.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index 07a0ead45..a5c2eb33e 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -950,7 +950,11 @@ def enrich_git_branches(self, ocean_backend, enrich_backend, run_month_days=[7, logger.debug("[git] study git-branches delete branch info for repo {} in index {}".format( git_repo.uri, anonymize_url(enrich_backend.elastic.index_url))) - self.delete_commit_branches(git_repo, enrich_backend) + try: + self.delete_commit_branches(git_repo, enrich_backend) + except Exception as e: + logger.error("[git] study git-branches delete failed on repo {}, due to {}".format(git_repo.uri, e)) + continue logger.debug("[git] study git-branches add branch info for repo {} in index {}".format( git_repo.uri, anonymize_url(enrich_backend.elastic.index_url)))