diff --git a/book/scripts/download_github_data.py b/book/scripts/download_github_data.py
index 9a0f22c..282892e 100644
--- a/book/scripts/download_github_data.py
+++ b/book/scripts/download_github_data.py
@@ -3,13 +3,14 @@
 all of 2i2c's team members. Store them in a local CSV file that is used in
 visualization notebooks to plot activity over time.
 """
+
 from github_activity import get_activity
 from datetime import datetime, timedelta
 from zoneinfo import ZoneInfo
 import pandas as pd
-import numpy as np
 import os
 from yaml import safe_load
+from tomlkit import parse
 from pathlib import Path
 from copy import deepcopy
 
@@ -20,7 +21,9 @@
 here = Path(".")
 
 # Load data that we'll use for visualization
-communities = safe_load((here / "../data/key-communities.yml").read_text())
+communities = parse(Path(here / "../data/key-communities.toml").read_text())[
+    "communities"
+]
 team = safe_load((here / "../data/team.yml").read_text())
 
 # If data already exists locally, load it
@@ -47,7 +50,7 @@
 if data is not None:
     max_in_data = data["updatedAt"].max()
     if max_in_data > time_window_begin:
-        time_window_begin = max_in_data 
+        time_window_begin = max_in_data
 
 # Uncomment this to manually define a start date
 # start = datetime(2023, 1, 10, tzinfo=ZoneInfo("UTC"))
@@ -65,16 +68,18 @@
 # Download latest batch of data from GitHub
 for community in communities:
     if time_window_end > today:
-        print(f"time_window_end date {time_window_end} is less than {today}, no data to update...")
+        print(
+            f"time_window_end date {time_window_end} is less than {today}, no data to update..."
+        )
         continue
 
     # Download the data from GitHub using github_activity
     # We do this in windows of 2 months and then concat in one DataFrame
     data_new = []
     for ii in range(1, len(time_breakpoints)):
-        start_time = time_breakpoints[ii-1]
+        start_time = time_breakpoints[ii - 1]
         stop_time = time_breakpoints[ii]
-        
+
         # Check for GitHub api authentication tokens and raise an error if none exist
         auth_keys = ["TOKEN_GITHUB_READONLY", "GITHUB_TOKEN"]
         for key in auth_keys:
@@ -83,13 +88,21 @@
                 break
             auth = None
         if auth is None:
-            print("No GitHub authentication token found, you will hit the rate limit...")
+            print(
+                "No GitHub authentication token found, you will hit the rate limit..."
+            )
            print(f"Searched for these key names: {auth_keys}")
 
-        print(f"Downloading activity in {community} from {start_time:%Y-%m-%d} to {stop_time:%Y-%m-%d}")
-        data_new.append(get_activity(community, f"{start_time:%Y-%m-%d}", f"{stop_time:%Y-%m-%d}", auth=auth))
+        print(
+            f"Downloading activity in {community} from {start_time:%Y-%m-%d} to {stop_time:%Y-%m-%d}"
+        )
+        data_new.append(
+            get_activity(
+                community, f"{start_time:%Y-%m-%d}", f"{stop_time:%Y-%m-%d}", auth=auth
+            )
+        )
     data_new = pd.concat(data_new)
-    
+
     # Clean up some fields so they're easier to work with later
     def _extract_node(item):
         """Extract any data that is nested in GraphQL sections."""
@@ -106,15 +119,16 @@ def _extract_node(item):
             return item["oid"]
         else:
             return item
+
     data_new = data_new.applymap(_extract_node)
-    
+
     # Extract values from a few special-case columns
     data_new["mergedBy"]
-    
+
     # Change datetime strings to objects
     for col in datetime_columns:
         data_new[col] = pd.to_datetime(data_new[col])
-    
+
     # Save our final data or append it to pre-existing data
     if data is None:
         data = data_new
diff --git a/book/scripts/download_hub_activity.py b/book/scripts/download_hub_activity.py
index 3aa4dad..baeda49 100644
--- a/book/scripts/download_hub_activity.py
+++ b/book/scripts/download_hub_activity.py
@@ -5,6 +5,7 @@
 ref: https://github.com/2i2c-org/infrastructure/tree/master/config/clusters
 """
+
 from rich import print
 from rich.progress import Progress
 import pandas as pd
 
@@ -24,10 +25,12 @@
 
 # Download the `infrastructure/` repository as a Zip file so we can inspect contents
 # For now we don't use a token because this *should* only be a single operation.
-URL_REPOSITORY_ZIP = "https://github.com/2i2c-org/infrastructure/archive/refs/heads/master.zip"
+URL_REPOSITORY_ZIP = (
+    "https://github.com/2i2c-org/infrastructure/archive/refs/heads/master.zip"
+)
 with urlopen(URL_REPOSITORY_ZIP) as zipresp:
     with ZipFile(BytesIO(zipresp.read())) as zfile:
-        zfile.extractall('./_build/data/')
+        zfile.extractall("./_build/data/")
 
 # These are the time scales that we know are in the hub's metrics
 search_time_scales = {"24h": "Daily", "7d": "Weekly", "30d": "Monthly"}
@@ -45,7 +48,9 @@
         if not cluster_yaml.exists():
             print(f"Skipping folder {cluster} because no cluster.yaml file exists...")
             continue
-        progress.update(p_clusters, description=f"Processing cluster {cluster.split('/')[-1]}...")
+        progress.update(
+            p_clusters, description=f"Processing cluster {cluster.split('/')[-1]}..."
+        )
         config = cluster_yaml.read_text()
         config = safe_load(config)
 
@@ -70,21 +75,23 @@
         for iline in metrics:
             if "jupyterhub_active_users" not in iline:
                 continue
-            
+
             # We expect three time scales per hub
             for scale, name in search_time_scales.items():
                 if scale in iline:
                     users = int(float(iline.split()[-1]))
-                    df.append({
-                        "cluster": cluster.split("/")[-1],
-                        "hub": hub["domain"],
-                        "scale": name,
-                        "users": users,
-                        "chart": hub["helm_chart"]
-                    })
+                    df.append(
+                        {
+                            "cluster": cluster.split("/")[-1],
+                            "hub": hub["domain"],
+                            "scale": name,
+                            "users": users,
+                            "chart": hub["helm_chart"],
+                        }
+                    )
         progress.update(p_hubs, advance=1)
     progress.update(p_clusters, advance=1)
-    
+
 # Convert to a to save as a CSV
 df = pd.DataFrame(df)
 path_out = Path(__file__).parent / ".." / "data" / "hub-activity.csv"
diff --git a/book/scripts/munge_css_accounting_data.py b/book/scripts/munge_css_accounting_data.py
index 05ef144..8ba422b 100644
--- a/book/scripts/munge_css_accounting_data.py
+++ b/book/scripts/munge_css_accounting_data.py
@@ -18,9 +18,9 @@
 # ## Instructions
 #
 # - Download latest data from our [Accounting Statements Folder](https://docs.google.com/spreadsheets/d/1PDpPAed_q35n1-xSNN1U9tzZg7tzzBVfpmGYFqMOneQ/edit?usp=share_link)
-# 
+#
 # > For example, here's [the account transactions up to 2023/01/31](https://docs.google.com/spreadsheets/d/1PDpPAed_q35n1-xSNN1U9tzZg7tzzBVfpmGYFqMOneQ/edit#gid=686580753)).
-# 
+#
 # - Move to the `_data` folder here
 # - Run this notebook
 # - The final cell will copy the munged data to your clipboard
@@ -51,7 +51,7 @@
 for irow in category_rows["Date"].values:
     key, val = irow.split(" ", 1)
     category_mapping[int(key)] = val
-    
+
 
 # Remove the "summary" lines from our account codes
 data = data.dropna(subset=["Source"])
@@ -65,7 +65,7 @@
         inet = f"-{inet}"
     for ichar in [",", "(", ")"]:
         inet = inet.replace(ichar, "")
-    
+
     # Make inet a float to make sure it's possible
     inet = float(inet)
 
@@ -77,7 +77,7 @@
     else:
         data.loc[ix, "Cost"] = inet
         data.loc[ix, "Revenue"] = 0
-    
+
 data = data.drop(columns=["Net"])
 
 # + [markdown] user_expressions=[]
@@ -94,5 +94,3 @@
 # -
 
 data
-
-