diff --git a/grimoire_elk/elastic.py b/grimoire_elk/elastic.py index 87a502c0b..d5fd932b8 100644 --- a/grimoire_elk/elastic.py +++ b/grimoire_elk/elastic.py @@ -299,6 +299,7 @@ def get_bulk_url(self): """Get the bulk URL endpoint""" if (self.major == '7' and self.distribution == 'elasticsearch') or \ + (self.major == '2' and self.distribution == 'opensearch') or \ (self.major == '1' and self.distribution == 'opensearch'): bulk_url = self.index_url + '/_bulk' else: @@ -312,6 +313,7 @@ def get_mapping_url(self, _type=None): :param _type: type of the mapping. In case of ES7, it is None """ if (self.major == '7' and self.distribution == 'elasticsearch') or \ + (self.major == '2' and self.distribution == 'opensearch') or \ (self.major == '1' and self.distribution == 'opensearch'): mapping_url = self.index_url + "/_mapping" else: diff --git a/grimoire_elk/enriched/git.py b/grimoire_elk/enriched/git.py index 9d116184b..a5c2eb33e 100644 --- a/grimoire_elk/enriched/git.py +++ b/grimoire_elk/enriched/git.py @@ -229,7 +229,8 @@ def get_rich_item(self, item): if 'refs' in commit: eitem["commit_tags"] = list(filter(lambda r: "tag: " in r, commit['refs'])) - + + eitem['parents'] = commit['parents'] eitem['hash_short'] = eitem['hash'][0:6] # Enrich dates author_date = str_to_datetime(commit["AuthorDate"]) @@ -293,11 +294,13 @@ def get_rich_item(self, item): # author_name and author_domain are added always identity = self.get_sh_identity(commit["Author"]) eitem["author_name"] = identity['name'] + eitem["author_email"] = identity['email'] eitem["author_domain"] = self.get_identity_domain(identity) # committer data identity = self.get_sh_identity(commit["Commit"]) eitem["committer_name"] = identity['name'] + eitem["committer_email"] = identity['email'] eitem["committer_domain"] = self.get_identity_domain(identity) # title from first line @@ -947,7 +950,11 @@ def enrich_git_branches(self, ocean_backend, enrich_backend, run_month_days=[7, logger.debug("[git] study git-branches delete 
branch info for repo {} in index {}".format( git_repo.uri, anonymize_url(enrich_backend.elastic.index_url))) - self.delete_commit_branches(git_repo, enrich_backend) + try: + self.delete_commit_branches(git_repo, enrich_backend) + except Exception as e: + logger.error("[git] study git-branches delete failed on repo {}, due to {}".format(git_repo.uri, e)) + continue logger.debug("[git] study git-branches add branch info for repo {} in index {}".format( git_repo.uri, anonymize_url(enrich_backend.elastic.index_url))) @@ -1040,6 +1047,10 @@ def add_commit_branches(self, git_repo, enrich_backend): if commit_count: self.__process_commits_in_branch(enrich_backend, git_repo.uri, branch_name, to_process) + # reset the counter + to_process = [] + commit_count = 0 + except Exception as e: logger.error("[git] Skip adding branch info for repo {} due to {}".format(git_repo.uri, e)) return diff --git a/grimoire_elk/enriched/github.py b/grimoire_elk/enriched/github.py index 72908e2de..7fde5c8cd 100644 --- a/grimoire_elk/enriched/github.py +++ b/grimoire_elk/enriched/github.py @@ -176,7 +176,11 @@ def get_time_to_first_attention(self, item): comment_dates = [str_to_datetime(comment['created_at']) for comment in item['comments_data'] if item['user']['login'] != comment['user']['login']] reaction_dates = [str_to_datetime(reaction['created_at']) for reaction in item['reactions_data'] - if item['user']['login'] != reaction['user']['login']] + if item.get('user') is not None and + item['user'].get('login') is not None and + reaction.get('user') is not None and + reaction['user'].get('login') is not None and + item['user']['login'] != reaction['user']['login']] reaction_dates.extend(comment_dates) if reaction_dates: return min(reaction_dates) @@ -203,6 +207,58 @@ def get_time_to_merge_request_response(self, item): return None + #get comments and exclude bot + def get_num_of_comments_without_bot(self, item): + """Get the num of comment was made to the issue by someone + other than the user 
who created the issue and bot + """ + comments = [comment for comment in item['comments_data'] + if comment['user'] and item['user']['login'] != comment['user']['login'] \ + and 'bot' not in comment['user']['login']] + return len(comments) + + def get_num_of_reviews_without_bot(self, item): + """Get the number of review comments made on the pull request by someone + other than the user who created the pull request and bots + """ + review_comments = [] + for comment in item['review_comments_data']: + # skip comments of ghost users + if not comment['user']: + continue + + # skip comments of bots and of the pull request creator + if 'bot' in comment['user']['login'] or item['user']['login'] == comment['user']['login']: + continue + + review_comments.append(comment) + + return len(review_comments) + + # get first attention without bot + def get_time_to_first_attention_without_bot(self, item): + """Get the first date at which a comment was made to the issue by someone + other than the user who created the issue and bot + """ + comment_dates = [str_to_datetime(comment['created_at']) for comment in item['comments_data'] + if comment['user'] and item['user']['login'] != comment['user']['login'] and 'bot' not in comment['user']['login']] + if comment_dates: + return min(comment_dates) + return None + + # get first review attention without bot + def get_time_to_first_review_attention_without_bot(self, item): + """Get the first date at which a review comment was made on the pull request by someone + other than the user who created the pull request and bots + """ + if 'review_comments_data' in item and item['review_comments_data']: + comment_dates = [str_to_datetime(comment['created_at']) for comment in item['review_comments_data'] + if 'login' in item['user'] and comment['user'] and item['user']['login'] != comment['user']['login'] and 'bot' not in comment['user']['login']] + if comment_dates: + return min(comment_dates) + else: + return None + def get_latest_comment_date(self, item): """Get the date of the latest comment on the issue/pr""" @@ -457,6 +513,7 @@ def
__get_rich_pull(self, item): if self.__has_user(user): rich_pr['user_name'] = user['name'] rich_pr['author_name'] = user['name'] + rich_pr['user_email'] = user.get('email', None) rich_pr["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_pr['user_org'] = user['company'] rich_pr['user_location'] = user['location'] @@ -468,6 +525,7 @@ def __get_rich_pull(self, item): rich_pr['user_location'] = None rich_pr['user_geolocation'] = None rich_pr['author_name'] = None + rich_pr['user_email'] = None merged_by = pull_request.get('merged_by_data', None) if merged_by and merged_by is not None: @@ -484,6 +542,20 @@ def __get_rich_pull(self, item): rich_pr['merge_author_org'] = None rich_pr['merge_author_location'] = None rich_pr['merge_author_geolocation'] = None + assignees_login = set() + for assignee in pull_request.get('assignees', []): + assignees_login.add(assignee.get('login')) + rich_pr['assignees_login'] = list(assignees_login) + requested_reviewers_login = set() + for requested_reviewer in pull_request.get('requested_reviewers', []): + requested_reviewers_login.add(requested_reviewer.get('login')) + rich_pr['requested_reviewers_login'] = list(requested_reviewers_login) + reviewers_login = set() + for reviewer in pull_request.get('reviews_data', []): + reviewer_user = reviewer.get('user') + if reviewer_user is not None: + reviewers_login.add(reviewer_user.get('login')) + rich_pr['reviewers_login'] = list(reviewers_login) rich_pr['id'] = pull_request['id'] rich_pr['id_in_repo'] = pull_request['html_url'].split("/")[-1] @@ -500,6 +572,7 @@ def __get_rich_pull(self, item): rich_pr['additions'] = pull_request['additions'] rich_pr['deletions'] = pull_request['deletions'] rich_pr['changed_files'] = pull_request['changed_files'] + rich_pr['merge_commit_sha'] = pull_request['merge_commit_sha'] # Adding this field for consistency with the rest of github-related enrichers rich_pr['issue_url'] = pull_request['html_url'] labels = [] @@ -524,6 
+597,15 @@ def __get_rich_pull(self, item): min_review_date = self.get_time_to_merge_request_response(pull_request) rich_pr['time_to_merge_request_response'] = \ get_time_diff_days(str_to_datetime(pull_request['created_at']), min_review_date) + rich_pr['num_review_comments_without_bot'] = \ + self.get_num_of_reviews_without_bot(pull_request) + rich_pr['time_to_first_attention_without_bot'] = \ + get_time_diff_days(str_to_datetime(pull_request['created_at']), + self.get_time_to_first_review_attention_without_bot(pull_request)) + + if 'linked_issues_data' in pull_request: + rich_pr['linked_issues_count'] = pull_request['linked_issues_data'] + rich_pr['commits_data'] = pull_request['commits_data'] if self.prjs_map: rich_pr.update(self.get_item_project(rich_pr)) @@ -560,6 +642,7 @@ def __get_rich_issue(self, item): if self.__has_user(user): rich_issue['user_name'] = user['name'] rich_issue['author_name'] = user['name'] + rich_issue['user_email'] = user.get('email', None) rich_issue["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_issue['user_org'] = user['company'] rich_issue['user_location'] = user['location'] @@ -571,6 +654,7 @@ def __get_rich_issue(self, item): rich_issue['user_location'] = None rich_issue['user_geolocation'] = None rich_issue['author_name'] = None + rich_issue['user_email'] = None assignee = issue.get('assignee_data', None) if self.__has_user(assignee): @@ -614,6 +698,7 @@ def __get_rich_issue(self, item): rich_issue['github_repo'] = rich_issue['repository'].replace(GITHUB, '') rich_issue['github_repo'] = re.sub('.git$', '', rich_issue['github_repo']) rich_issue["url_id"] = rich_issue['github_repo'] + "/issues/" + rich_issue['id_in_repo'] + rich_issue['body'] = issue['body'] if self.prjs_map: rich_issue.update(self.get_item_project(rich_issue)) @@ -622,11 +707,17 @@ def __get_rich_issue(self, item): rich_issue['project'] = item['project'] rich_issue['time_to_first_attention'] = None + 
rich_issue['num_of_comments_without_bot'] = None if issue['comments'] + issue['reactions']['total_count'] != 0: rich_issue['time_to_first_attention'] = \ get_time_diff_days(str_to_datetime(issue['created_at']), self.get_time_to_first_attention(issue)) - + rich_issue['num_of_comments_without_bot'] = \ + self.get_num_of_comments_without_bot(issue) + rich_issue['time_to_first_attention_without_bot'] = \ + get_time_diff_days(str_to_datetime(issue['created_at']), + self.get_time_to_first_attention_without_bot(issue)) + rich_issue.update(self.get_grimoire_fields(issue['created_at'], "issue")) item[self.get_field_date()] = rich_issue[self.get_field_date()] @@ -646,6 +737,37 @@ def __get_rich_repo(self, item): rich_repo['stargazers_count'] = repo['stargazers_count'] rich_repo['fetched_on'] = repo['fetched_on'] rich_repo['url'] = repo['html_url'] + rich_repo['archived'] = repo['archived'] + rich_repo['archivedAt'] = repo.get('archivedAt') + rich_repo['created_at'] = repo['created_at'] + rich_repo['updated_at'] = repo['updated_at'] + + + rich_releases = [] + releases = repo.get('releases') + if releases: + for release in releases: + rich_releases_dict = {} + rich_releases_dict['id'] = release['id'] + rich_releases_dict['tag_name'] = release['tag_name'] + rich_releases_dict['target_commitish'] = release['target_commitish'] + rich_releases_dict['prerelease'] = release['prerelease'] + rich_releases_dict['name'] = release['name'] + rich_releases_dict['body'] = '' + rich_releases_dict['created_at'] = release['created_at'] + release_author = release['author'] + rich_releases_author_dict = {} + if release_author is None: + rich_releases_author_dict['login'] = '' + else: + rich_releases_author_dict['login'] = release_author['login'] + rich_releases_author_dict['name'] = '' + rich_releases_dict['author'] = rich_releases_author_dict + rich_releases.append(rich_releases_dict) + rich_repo['releases'] = rich_releases + rich_repo['releases_count'] = len(rich_releases) + + 
rich_repo['topics'] = repo.get("topics", []) if self.prjs_map: rich_repo.update(self.get_item_project(rich_repo)) diff --git a/grimoire_elk/enriched/github2.py b/grimoire_elk/enriched/github2.py index 61e1698d6..7b01d66bf 100644 --- a/grimoire_elk/enriched/github2.py +++ b/grimoire_elk/enriched/github2.py @@ -510,6 +510,7 @@ def __get_rich_pull(self, item): if user is not None and user: rich_pr['user_name'] = user['name'] rich_pr['author_name'] = user['name'] + rich_pr['user_email'] = user.get('email', None) rich_pr["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_pr['user_org'] = user['company'] rich_pr['user_location'] = user['location'] @@ -521,6 +522,7 @@ def __get_rich_pull(self, item): rich_pr['user_location'] = None rich_pr['user_geolocation'] = None rich_pr['author_name'] = None + rich_pr['user_email'] = None merged_by = pull_request.get('merged_by_data', None) if merged_by and merged_by != USER_NOT_AVAILABLE: @@ -618,6 +620,7 @@ def __get_rich_issue(self, item): if user is not None and user: rich_issue['user_name'] = user['name'] rich_issue['author_name'] = user['name'] + rich_issue['user_email'] = user.get('email', None) rich_issue["user_domain"] = self.get_email_domain(user['email']) if user['email'] else None rich_issue['user_org'] = user['company'] rich_issue['user_location'] = user['location'] @@ -629,6 +632,7 @@ def __get_rich_issue(self, item): rich_issue['user_location'] = None rich_issue['user_geolocation'] = None rich_issue['author_name'] = None + rich_issue['user_email'] = None assignee = issue.get('assignee_data', None) if assignee and assignee != USER_NOT_AVAILABLE: diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index 8e6947e36..cbbfa0ab4 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -37,6 +37,13 @@ CLOSED_EVENTS = ['ClosedEvent'] MERGED_EVENTS = ['MergedEvent'] PULL_REQUEST_REVIEW_EVENTS = ['PullRequestReview'] +REOPENED_EVENT 
= ['ReopenedEvent'] +ASSIGNED_EVENT = ['AssignedEvent', 'UnassignedEvent'] +MILESTONED_EVENT = ['MilestonedEvent', 'DemilestonedEvent'] +MARKED_AS_DUPLICATE_EVENT = ['MarkedAsDuplicateEvent'] +TRANSFERRED_EVENT = ['TransferredEvent'] +TITLE_EVENT = ['RenamedTitleEvent'] + logger = logging.getLogger(__name__) @@ -147,7 +154,16 @@ def get_project_repository(self, eitem): @metadata def get_rich_item(self, item): - rich_item = self.__get_rich_event(item) + rich_item = {} + if item['category'] == 'event': + rich_item = self.__get_rich_event(item) + elif item['category'] == 'stargazer': + rich_item = self.__get_rich_stargazer(item) + elif item['category'] == 'fork': + rich_item = self.__get_rich_fork(item) + else: + logger.error("[githubql] rich item not defined for GitHubQL category {}".format( + item['category'])) self.add_repository_labels(rich_item) self.add_metadata_filter_raw(rich_item) @@ -171,6 +187,7 @@ def __get_rich_event(self, item): rich_event['event_type'] = event['eventType'] rich_event['created_at'] = event['createdAt'] rich_event['actor_username'] = actor['login'] if actor else None + rich_event['user_login'] = rich_event['actor_username'] rich_event['repository'] = self.get_project_repository(rich_event) rich_event['pull_request'] = True rich_event['item_type'] = 'pull request' @@ -276,6 +293,29 @@ def __get_rich_event(self, item): rich_event['merge_updated_at'] = review['updatedAt'] rich_event['merge_url'] = review['url'] item['data']['actor'] = item['data']['author'] + elif rich_event['event_type'] in ASSIGNED_EVENT: + event_assignee = event.get("assignee") + if event_assignee is not None: + rich_event['assignee_login'] = event_assignee.get("login", None) + elif rich_event['event_type'] in MARKED_AS_DUPLICATE_EVENT: + duplicate = event['canonical'] + rich_event['duplicate_cross_repo'] = event['isCrossRepository'] + rich_event['duplicate_number'] = duplicate['number'] + rich_event['duplicate_url'] = duplicate['url'] + rich_event['duplicate_repo'] = 
'/'.join(duplicate['url'].replace(GITHUB, '').split('/')[:-2]) + rich_event['duplicate_created_at'] = duplicate['createdAt'] + rich_event['duplicate_updated_at'] = duplicate['updatedAt'] + rich_event['duplicate_closed_at'] = duplicate['closedAt'] + rich_event['duplicate_closed'] = duplicate['closed'] + rich_event['duplicate_merged'] = duplicate.get('merged', None) + elif rich_event['event_type'] in TRANSFERRED_EVENT: + from_repository = event['fromRepository'] + rich_event['from_repo_id'] = from_repository['id'] + rich_event['from_repo_url'] = from_repository['url'] + rich_event['from_repo'] = from_repository['url'].replace(GITHUB, '') + elif rich_event['event_type'] in TITLE_EVENT: + rich_event['current_title'] = event['currentTitle'] + rich_event['previous_title'] = event['previousTitle'] else: logger.warning("[github] event {} not processed".format(rich_event['event_type'])) @@ -300,6 +340,46 @@ def __get_rich_event(self, item): return rich_event + def __get_rich_stargazer(self, item): + rich_stargazer = {} + + self.copy_raw_fields(self.RAW_FIELDS_COPY, item, rich_stargazer) + + stargazer = item['data'] + rich_stargazer['user_login'] = stargazer['login'] + rich_stargazer['user_id'] = stargazer['id'] + rich_stargazer['user_name'] = stargazer.get('name', None) + rich_stargazer['author_name'] = stargazer.get('name', None) + rich_stargazer['user_email'] = stargazer.get('email', None) + rich_stargazer['user_company'] = stargazer.get('company', None) + rich_stargazer['created_at'] = stargazer['createdAt'] + + if self.prjs_map: + rich_stargazer.update(self.get_item_project(rich_stargazer)) + + rich_stargazer.update(self.get_grimoire_fields(stargazer['createdAt'], "stargazer")) + + return rich_stargazer + + def __get_rich_fork(self, item): + rich_fork = {} + + self.copy_raw_fields(self.RAW_FIELDS_COPY, item, rich_fork) + + fork = item['data'] + rich_fork['fork_id'] = fork['id'] + rich_fork['fork_url'] = fork['url'] + rich_fork['user_login'] = fork['owner']['login'] +
rich_fork['user_id'] = fork['owner']['id'] + rich_fork['created_at'] = fork['createdAt'] + + if self.prjs_map: + rich_fork.update(self.get_item_project(rich_fork)) + + rich_fork.update(self.get_grimoire_fields(fork['createdAt'], "fork")) + + return rich_fork + def enrich_duration_analysis(self, ocean_backend, enrich_backend, start_event_type, target_attr, fltr_event_types, fltr_attr=None, page_size=200): """The purpose of this study is to calculate the duration between two GitHub events. It requires diff --git a/grimoire_elk/raw/github.py b/grimoire_elk/raw/github.py index 687702bec..3f4a97598 100644 --- a/grimoire_elk/raw/github.py +++ b/grimoire_elk/raw/github.py @@ -74,6 +74,15 @@ def get_elastic_mappings(es_major): "body": { "type": "text", "index": true + }, + "releases": { + "dynamic":false, + "properties": { + "body": { + "type": "text", + "index": true + } + } } } }