Fix key communities script (#24)
* Fix key communities script

* Blackify
choldgraf authored Feb 7, 2024
1 parent bd65cca commit f15f2d2
Showing 3 changed files with 51 additions and 32 deletions.
40 changes: 27 additions & 13 deletions book/scripts/download_github_data.py
@@ -3,13 +3,14 @@
all of 2i2c's team members. Store them in a local CSV file that is
used in visualization notebooks to plot activity over time.
"""

from github_activity import get_activity
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
import pandas as pd
import numpy as np
import os
from yaml import safe_load
from tomlkit import parse
from pathlib import Path
from copy import deepcopy

@@ -20,7 +21,9 @@
here = Path(".")

# Load data that we'll use for visualization
communities = safe_load((here / "../data/key-communities.yml").read_text())
communities = parse(Path(here / "../data/key-communities.toml").read_text())[
"communities"
]
team = safe_load((here / "../data/team.yml").read_text())

# If data already exists locally, load it
@@ -47,7 +50,7 @@
if data is not None:
max_in_data = data["updatedAt"].max()
if max_in_data > time_window_begin:
time_window_begin = max_in_data
time_window_begin = max_in_data

# Uncomment this to manually define a start date
# start = datetime(2023, 1, 10, tzinfo=ZoneInfo("UTC"))
@@ -65,16 +68,18 @@
# Download latest batch of data from GitHub
for community in communities:
if time_window_end > today:
print(f"time_window_end date {time_window_end} is less than {today}, no data to update...")
print(
f"time_window_end date {time_window_end} is less than {today}, no data to update..."
)
continue

# Download the data from GitHub using github_activity
# We do this in windows of 2 months and then concat in one DataFrame
data_new = []
for ii in range(1, len(time_breakpoints)):
start_time = time_breakpoints[ii-1]
start_time = time_breakpoints[ii - 1]
stop_time = time_breakpoints[ii]

# Check for GitHub api authentication tokens and raise an error if none exist
auth_keys = ["TOKEN_GITHUB_READONLY", "GITHUB_TOKEN"]
for key in auth_keys:
@@ -83,13 +88,21 @@
break
auth = None
if auth is None:
print("No GitHub authentication token found, you will hit the rate limit...")
print(
"No GitHub authentication token found, you will hit the rate limit..."
)
print(f"Searched for these key names: {auth_keys}")

print(f"Downloading activity in {community} from {start_time:%Y-%m-%d} to {stop_time:%Y-%m-%d}")
data_new.append(get_activity(community, f"{start_time:%Y-%m-%d}", f"{stop_time:%Y-%m-%d}", auth=auth))
print(
f"Downloading activity in {community} from {start_time:%Y-%m-%d} to {stop_time:%Y-%m-%d}"
)
data_new.append(
get_activity(
community, f"{start_time:%Y-%m-%d}", f"{stop_time:%Y-%m-%d}", auth=auth
)
)
data_new = pd.concat(data_new)

# Clean up some fields so they're easier to work with later
def _extract_node(item):
"""Extract any data that is nested in GraphQL sections."""
@@ -106,15 +119,16 @@ def _extract_node(item):
return item["oid"]
else:
return item

data_new = data_new.applymap(_extract_node)

# Extract values from a few special-case columns
data_new["mergedBy"]

# Change datetime strings to objects
for col in datetime_columns:
data_new[col] = pd.to_datetime(data_new[col])

# Save our final data or append it to pre-existing data
if data is None:
data = data_new
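
The core change in download_github_data.py above swaps the key-communities source from key-communities.yml (read with yaml.safe_load) to key-communities.toml (read with tomlkit.parse), then pulls the "communities" array out of the parsed document; safe_load stays imported because team.yml is still read with it. As a hedged illustration of that loading pattern (the TOML file itself is not part of this diff, so the contents below are made up):

from tomlkit import parse

# Hypothetical contents of ../data/key-communities.toml; the real file is not
# shown in this diff, only the loading code that expects a "communities" array.
example_toml = """
communities = [
    "2i2c-org/infrastructure",
    "jupyterhub/jupyterhub",
]
"""

# tomlkit.parse returns a TOMLDocument that behaves like a dict, so indexing
# by "communities" yields a list-like array of strings to iterate over.
communities = parse(example_toml)["communities"]

for community in communities:
    print(community)
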
31 changes: 19 additions & 12 deletions book/scripts/download_hub_activity.py
@@ -5,6 +5,7 @@
ref: https://github.com/2i2c-org/infrastructure/tree/master/config/clusters
"""

from rich import print
from rich.progress import Progress
import pandas as pd
@@ -24,10 +25,12 @@

# Download the `infrastructure/` repository as a Zip file so we can inspect contents
# For now we don't use a token because this *should* only be a single operation.
URL_REPOSITORY_ZIP = "https://github.com/2i2c-org/infrastructure/archive/refs/heads/master.zip"
URL_REPOSITORY_ZIP = (
"https://github.com/2i2c-org/infrastructure/archive/refs/heads/master.zip"
)
with urlopen(URL_REPOSITORY_ZIP) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall('./_build/data/')
zfile.extractall("./_build/data/")

# These are the time scales that we know are in the hub's metrics
search_time_scales = {"24h": "Daily", "7d": "Weekly", "30d": "Monthly"}
@@ -45,7 +48,9 @@
if not cluster_yaml.exists():
print(f"Skipping folder {cluster} because no cluster.yaml file exists...")
continue
progress.update(p_clusters, description=f"Processing cluster {cluster.split('/')[-1]}...")
progress.update(
p_clusters, description=f"Processing cluster {cluster.split('/')[-1]}..."
)
config = cluster_yaml.read_text()
config = safe_load(config)

@@ -70,21 +75,23 @@
for iline in metrics:
if "jupyterhub_active_users" not in iline:
continue

# We expect three time scales per hub
for scale, name in search_time_scales.items():
if scale in iline:
users = int(float(iline.split()[-1]))
df.append({
"cluster": cluster.split("/")[-1],
"hub": hub["domain"],
"scale": name,
"users": users,
"chart": hub["helm_chart"]
})
df.append(
{
"cluster": cluster.split("/")[-1],
"hub": hub["domain"],
"scale": name,
"users": users,
"chart": hub["helm_chart"],
}
)
progress.update(p_hubs, advance=1)
progress.update(p_clusters, advance=1)

# Convert to a DataFrame to save as a CSV
df = pd.DataFrame(df)
path_out = Path(__file__).parent / ".." / "data" / "hub-activity.csv"
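
For context on the metrics loop above: the script walks each hub's metrics line by line (apparently Prometheus-style text), keeps only the jupyterhub_active_users lines, maps the 24h/7d/30d period onto Daily/Weekly/Monthly via search_time_scales, and reads the user count from the last whitespace-separated token. A minimal sketch of that parsing step, using made-up metric lines (the real label names and values may differ):

# Hypothetical metrics lines; the real hub output may differ, but the parsing
# below mirrors the script: keep jupyterhub_active_users lines, match the
# period string, and take the value from the final whitespace-separated token.
metrics = [
    'jupyterhub_active_users{period="24h"} 12.0',
    'jupyterhub_active_users{period="7d"} 45.0',
    'jupyterhub_active_users{period="30d"} 130.0',
    'jupyterhub_total_users 512.0',
]

search_time_scales = {"24h": "Daily", "7d": "Weekly", "30d": "Monthly"}

rows = []
for iline in metrics:
    if "jupyterhub_active_users" not in iline:
        continue
    for scale, name in search_time_scales.items():
        if scale in iline:
            # int(float(...)) tolerates counts reported as "12.0".
            users = int(float(iline.split()[-1]))
            rows.append({"scale": name, "users": users})

print(rows)
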
12 changes: 5 additions & 7 deletions book/scripts/munge_css_accounting_data.py
@@ -18,9 +18,9 @@
# ## Instructions
#
# - Download latest data from our [Accounting Statements Folder](https://docs.google.com/spreadsheets/d/1PDpPAed_q35n1-xSNN1U9tzZg7tzzBVfpmGYFqMOneQ/edit?usp=share_link)
#
#
# > For example, here's [the account transactions up to 2023/01/31](https://docs.google.com/spreadsheets/d/1PDpPAed_q35n1-xSNN1U9tzZg7tzzBVfpmGYFqMOneQ/edit#gid=686580753).
#
#
# - Move to the `_data` folder here
# - Run this notebook
# - The final cell will copy the munged data to your clipboard
@@ -51,7 +51,7 @@
for irow in category_rows["Date"].values:
key, val = irow.split(" ", 1)
category_mapping[int(key)] = val

# Remove the "summary" lines from our account codes
data = data.dropna(subset=["Source"])

@@ -65,7 +65,7 @@
inet = f"-{inet}"
for ichar in [",", "(", ")"]:
inet = inet.replace(ichar, "")

# Convert inet to a float to make sure the value is actually numeric
inet = float(inet)

@@ -77,7 +77,7 @@
else:
data.loc[ix, "Cost"] = inet
data.loc[ix, "Revenue"] = 0

data = data.drop(columns=["Net"])

# + [markdown] user_expressions=[]
@@ -94,5 +94,3 @@
# -

data
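
The hunks above show only fragments of the Net clean-up, so as a hedged recap: accounting exports wrap negative amounts in parentheses and use comma thousands separators, and the script normalizes each Net string into a signed float before splitting it into Cost and Revenue columns and dropping Net. A minimal sketch of that conversion; the sign-based split is an assumption here, since the actual branching condition is elided in this diff:

import pandas as pd

# Hypothetical rows; the real export has more columns (Source, Date, ...).
data = pd.DataFrame({"Net": ["1,200.00", "(350.25)", "42.00"]})

costs, revenues = [], []
for inet in data["Net"]:
    # Accounting convention: "(350.25)" means -350.25.
    if "(" in inet:
        inet = f"-{inet}"
    for ichar in [",", "(", ")"]:
        inet = inet.replace(ichar, "")
    inet = float(inet)

    # Assumed rule: positive amounts count as revenue, negative as cost.
    # The script's actual condition is not visible in this diff.
    if inet > 0:
        revenues.append(inet)
        costs.append(0)
    else:
        costs.append(inet)
        revenues.append(0)

data["Revenue"] = revenues
data["Cost"] = costs
data = data.drop(columns=["Net"])
print(data)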

