Skip to content

Commit

Permalink
[script] Add script to help generate the Changelog (#23744)
Browse files Browse the repository at this point in the history
## Summary & Motivation

As title. This adds a script to read through the commits since the
previous release and create changelog entries for them. The expected
workflow is for PR authors to add their changelog entries in their
commit messages, i.e.:

CHANGELOG:
Added internal script to generate changelog entries.

In the case that your PR does not actually require any acknowledgement
in the changelog (such as this one, as it is an internal script), you
can indicate that by adding the NOCHANGELOG string anywhere in your
commit message.

If neither of these strings is found, then the commit is considered
"Undocumented", and manual intervention will be required. We can trial
this workflow for a few weeks before adding CI hooks to enforce that you
add changelog text to your commits.

To generate a new changelog, you can run:

```
python scripts/generate_changelog.py 1.8.2 | pbcopy
```

Which you can then paste into your collaborative docs editor of choice
(note, once we use CI to enforce CL entries, we can cut this manual step
out entirely).


## How I Tested These Changes

Check out this sample changelog doc generated by the script:
https://www.notion.so/dagster/7dd240b804b5439c8e65725d24ac0f29?showMoveTo=true&saveParent=true
  • Loading branch information
OwenKephart authored and sryza committed Aug 24, 2024
1 parent 167facc commit 1c060e7
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## Summary & Motivation

## How I Tested These Changes

## Changelog [New | Bug | Docs]

> Replace this message with a changelog entry, or `NOCHANGELOG`
165 changes: 165 additions & 0 deletions scripts/generate_changelog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import os
import re
import sys
from collections import defaultdict
from typing import Iterator, List, Mapping, NamedTuple, Optional, Sequence

import click
import git
from path import Path

# Base URL of the public repository; used to build pull-request links.
GITHUB_URL = "https://github.com/dagster-io/dagster"
# Repository rooted two directories above this script file.
OSS_REPO = git.Repo(Path(__file__).parent.parent)
# Second repository, located through an environment variable. This lookup runs at import
# time, so an unset DAGSTER_INTERNAL_GIT_REPO_DIR raises KeyError before any command runs.
INTERNAL_REPO = git.Repo(os.environ["DAGSTER_INTERNAL_GIT_REPO_DIR"])

# Matches a changelog header line such as "## Changelog [New]" and captures the bracketed
# category with surrounding whitespace trimmed. The match is case-insensitive so that the
# "## Changelog" spelling is recognised as well as "## CHANGELOG".
CHANGELOG_HEADER_PATTERN = re.compile(r"## CHANGELOG.*\[\s*(.*?)\s*\]", re.IGNORECASE)
# Commits whose message contains this token are left out of the changelog entirely.
IGNORE_TOKEN = "NOCHANGELOG"

# Maps the category token written in a changelog header to the section title used in the
# generated changelog. Iteration order of the values determines the order of the sections.
CATEGORIES = {
    "NEW": "New",
    "BUG": "Bugfixes",
    "DOCS": "Documentation",
    "BREAKING": "Breaking Changes",
    "DEPRECATE": "Deprecations",
    "PLUS": "Dagster Plus",
    # Section title used for entries whose category token is not one of the keys above.
    None: "Invalid",
}


class ParsedCommit(NamedTuple):
    """Changelog-relevant fields extracted from a single commit."""

    # Markdown link to the pull request referenced by the commit title.
    issue_link: str
    # Section title the entry belongs under.
    changelog_category: str
    # Changelog text found in the commit message, or None when there is none.
    raw_changelog_entry: Optional[str]
    # First line of the commit message.
    raw_title: str
    # Name of the commit author.
    author: str
    # Name of the repository the commit came from.
    repo_name: str

    @property
    def documented(self) -> bool:
        """Whether a non-empty changelog entry was found for this commit."""
        if self.raw_changelog_entry:
            return True
        return False


def _get_previous_version(new_version: str) -> str:
split = new_version.split(".")
previous_patch = int(split[-1]) - 1
assert previous_patch >= 0, "Must explicitly set `previous_version` on major releases."
return ".".join([*split[:-1], str(previous_patch)])


def _get_libraries_version(new_version: str) -> str:
split = new_version.split(".")
new_minor = int(split[1]) + 16
return ".".join([split[0], str(new_minor), split[2]])


def _get_parsed_commit(commit: git.Commit) -> ParsedCommit:
    """Extract the changelog-relevant fields from a raw commit.

    The commit title is the first line of the message. The changelog entry is the first
    non-blank line that follows a changelog header line (see ``CHANGELOG_HEADER_PATTERN``);
    the bracketed token in that header selects the category. When no header is present the
    entry is ``None`` and the category is "Invalid".

    Args:
        commit: The commit to inspect.

    Returns:
        A ``ParsedCommit`` describing the commit.
    """
    # Iterate over real lines: splitting on arbitrary whitespace would yield single words,
    # which can never match a header pattern that itself contains a space.
    message_lines = str(commit.message).splitlines()
    title = message_lines[0] if message_lines else ""
    # Titles are formatted as "Lorem ipsum ... (#12345)", so take whatever follows the last
    # octothorpe and chop off the closing paren.
    issue_number = title.split("#")[-1][:-1]
    issue_link = f"[#{issue_number}]({GITHUB_URL}/pull/{issue_number})"

    changelog_category = "Invalid"
    raw_changelog_entry = None
    header_seen = False
    for line in message_lines:
        stripped = line.strip()
        if header_seen:
            # The entry is the first non-blank line after the header.
            if stripped:
                raw_changelog_entry = stripped
                break
            continue
        match = CHANGELOG_HEADER_PATTERN.match(stripped)
        if match:
            # Category keys are upper-case; accept any casing in the commit message.
            changelog_category = CATEGORIES.get(match.group(1).upper(), changelog_category)
            header_seen = True

    return ParsedCommit(
        issue_link=issue_link,
        changelog_category=changelog_category,
        raw_changelog_entry=raw_changelog_entry,
        raw_title=title,
        author=str(commit.author.name),
        # The repository name is the directory that contains the git dir.
        repo_name=str(commit.repo.git_dir).split("/")[-2],
    )


def _normalize(name: str) -> str:
return name.replace(" ", "").lower()


def _is_external_commit(commit: git.Commit) -> bool:
    """Heuristically decide whether a commit has a co-author other than its author.

    Names are compared after normalization (spaces removed, lower-cased).
    """
    # not super accurate at the moment, we'll probably need to actually ping the Github API
    co_authors = commit.co_authors
    if not co_authors:
        return False

    author_key = _normalize(str(commit.author.name))
    for co_author in co_authors:
        if _normalize(str(co_author.name)) != author_key:
            return True
    return False


def _get_documented_section(documented: Sequence[ParsedCommit]) -> str:
    """Render the documented commits as markdown, one "###" section per category.

    Every category in ``CATEGORIES`` gets a header, in that mapping's order, even when it
    has no entries. Entries are rendered as bullet items beneath their category header.

    Args:
        documented: Commits that carry a changelog entry.

    Returns:
        The markdown text for all category sections.
    """
    # Group by category (not author): the lookup below is keyed on the category title.
    grouped_commits: Mapping[str, List[ParsedCommit]] = defaultdict(list)
    for commit in documented:
        grouped_commits[commit.changelog_category].append(commit)

    parts: List[str] = []
    for category in CATEGORIES.values():
        parts.append(f"### {category}\n")
        entries = [
            f"* {commit.issue_link} {commit.raw_changelog_entry}"
            for commit in grouped_commits.get(category, [])
        ]
        if entries:
            parts.append("\n" + "\n".join(entries) + "\n")
        # Blank line so the next header never runs into the preceding text.
        parts.append("\n")
    return "".join(parts)


def _get_undocumented_section(undocumented: Sequence[ParsedCommit]) -> str:
    """Render commits without a changelog entry as a markdown checklist.

    Authors are listed in sorted order, each followed by a nested checklist item for
    every one of their commits.
    """
    by_author: Mapping[str, List[ParsedCommit]] = defaultdict(list)
    for parsed in undocumented:
        by_author[parsed.author].append(parsed)

    pieces = ["# Undocumented Changes"]
    for author in sorted(by_author):
        pieces.append(f"\n- [ ] {author}")
        pieces.extend(
            f"\n\t- [ ] (repo:{parsed.repo_name}) {parsed.issue_link} {parsed.raw_title}"
            for parsed in by_author[author]
        )
    return "".join(pieces)


def _get_commits(
    repos: Sequence[git.Repo], new_version: str, prev_version: str
) -> Iterator[ParsedCommit]:
    """Yield a parsed commit for each commit between the two release revisions.

    Each repository is walked in turn over the range
    ``release-<prev_version>..release-<new_version>``. Commits whose message contains
    the ignore token are skipped.
    """
    revision_range = f"release-{prev_version}..release-{new_version}"
    for repo in repos:
        for commit in repo.iter_commits(rev=revision_range):
            if IGNORE_TOKEN not in str(commit.message):
                yield _get_parsed_commit(commit)


def _generate_changelog(new_version: str, prev_version: str) -> None:
    """Write the changelog for ``prev_version``..``new_version`` to standard output.

    Commits from both repositories are split into those with a changelog entry and those
    without; the output is a header followed by the documented and undocumented sections.
    """
    with_entry: List[ParsedCommit] = []
    without_entry: List[ParsedCommit] = []

    for parsed in _get_commits([OSS_REPO, INTERNAL_REPO], new_version, prev_version):
        bucket = with_entry if parsed.documented else without_entry
        bucket.append(parsed)

    header = f"# Changelog {new_version}\n\n## {new_version} (core) / {_get_libraries_version(new_version)} (libraries)\n\n"
    documented_section = _get_documented_section(with_entry)
    undocumented_section = _get_undocumented_section(without_entry)
    sys.stdout.write(f"{header}\n{documented_section}\n{undocumented_section}")


# Command-line entry point: NEW_VERSION is required; PREV_VERSION is optional and, when
# omitted, is derived from NEW_VERSION.
@click.command()
@click.argument("new_version", type=str, required=True)
@click.argument("prev_version", type=str, required=False)
def generate_changelog(new_version: str, prev_version: Optional[str] = None) -> None:
    baseline = _get_previous_version(new_version) if prev_version is None else prev_version
    _generate_changelog(new_version, baseline)


# Run the command only when the file is executed as a script.
if __name__ == "__main__":
    generate_changelog()

0 comments on commit 1c060e7

Please sign in to comment.