Skip to content


finished analysis script docs
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Jul 5, 2024
1 parent 7e9f180 commit c2fb143
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 24 deletions.
4 changes: 2 additions & 2 deletions src/analysis/
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ The scripts in this directory were used to produce derived data and plots.
- [``](./ aggregates all data mined from GitHub into four datasets described in the wiki. Crucially, the data is reshaped into a time-indexed format for three of those datasets.
- [``](./ visualises the relationship between how a repository is cited and the difference between its creation date and the publication date.
- [``](./ creates a dataset with all repositories mined from ePrints for which we manually determined the citation type. The resulting dataset contains data from ePrints as well as a label indicating whether the software was cited as created software.
- [``](./
- [``](./
- [``](./ creates one plot containing visualisations and data about all repositories. The dataset can be filtered for a subset of repositories with the `--filter` argument.
- [``](./ creates one plot for one repository, focussing on timelined data. The code to produce these uses the raw data rather than the aggregated data produced by [``](./ as this script was written before [``](./ Both scripts use the same data manipulation methods - directly plotting data produced by [``](./ should result in similar graphs.

The schemas for any produced datasets are included in the wiki.
85 changes: 75 additions & 10 deletions src/analysis/
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,42 @@ def info(verbose, msg):
print(f"[INFO] {msg}")

def clean_heading(h):
    """Clean and normalise extracted headings.

    Leading numbering is removed, markdown-style links are reduced to their
    link text, leading/trailing punctuation is stripped and the result is
    lower-cased. Punctuation inside the heading is kept.

    Args:
        h (str): heading text

    Returns:
        str: cleaned heading
    """
    # remove leading numbering such as "1. " or "2: " (digits, whitespace, "." and ":")
    to_remove = string.digits + string.whitespace + ".:"
    h = h.lstrip(to_remove)
    # replace markdown-style links "[text](target)" with just "text";
    # raw string so the backslashes reach the regex engine unchanged
    pattern = r"\[(.+?)\]\(.+?\)"
    h = re.sub(pattern, r"\1", h, count=0)
    # strip leading/trailing punctuation and convert to lower-case
    # NOTE: the former `h.replace(string.punctuation, "")` was dropped - it only
    # matched the entire punctuation string as one literal substring, so it
    # never altered a heading.
    h = h.strip(string.punctuation)
    h = h.lower()
    return h

def plot_license_type(contents, ax):
"""Plot a bar chart indicating the number of repositories with permissive, non-permissive, unknown type license or no license at all.
contents (pd.DataFrame): contents data mined from GitHub
ax (Axes): subplot to use
contents = contents.copy()
permissive_licenses = ["mit", "gpl-3.0", "apache-2.0", "bsd-3-clause", "gpl-2.0", "bsd-2-clause"] #
contents.license = contents.license.fillna('None')
# If not permissive, check if it's non-existent or type other, otherwise class as non-permissive
contents["license_type"] = np.where(
contents.license.isin(permissive_licenses), "permissive", np.where(
contents.license == "None", "None", np.where(
contents.license == "other", "unknown", "non-permissive")))
# plot value counts
Expand All @@ -41,6 +60,12 @@ def plot_license_type(contents, ax):
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

def plot_contributing_file_present(contents, ax):
"""Plot a bar chart visualising the number of repositories with contribution guidelines.
contents (pd.DataFrame): contents data mined from GitHub
ax (Axes): subplot to use
Expand All @@ -51,6 +76,12 @@ def plot_contributing_file_present(contents, ax):
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

def plot_emojis(contents, ax):
"""Plot a histogram visualising the number of emojis found in repository READMEs.
contents (pd.DataFrame): contents data mined from GitHub
ax (Axes): subplot to use
bins = [0, 1, 2, 5, 10]
if contents.readme_emojis.max() > bins[-1]:
Expand All @@ -62,15 +93,24 @@ def plot_emojis(contents, ax):
ax.set(xlabel="number of emojis in README", ylabel="repository count")

def plot_team_size(metadata, contributions, ax):
"""Plot a histogram visualising the maximum team size for a repository.
metadata (pd.DataFrame): metadata mined from GitHub
contributions (pd.DataFrame): contributions (i.e. commit) data mined from GitHub
ax (Axes): subplot to use
contrib_df = pd.merge(metadata[["github_user_cleaned_url", "created_at"]], contributions)
# add week timeline info
contrib_df["week_since_repo_creation"] = (contrib_df.week_co - contrib_df.created_at).dt.days // 7
team_df = contrib_df[["github_user_cleaned_url", "author", "week_since_repo_creation", "commits"]].set_index(["github_user_cleaned_url", "author", "week_since_repo_creation"]).sort_index()
# user is active contributor if made at least one commit in last 12 weeks
# user is considered an active contributor if they made at least one commit in the last 12 weeks
windowed_team_df = team_df.groupby(level="author").rolling(window=12, min_periods=0).sum().droplevel(0)
windowed_team_df["active contributors"] = windowed_team_df.commits > 0
# team size
# team size: number of active contributors within one week
team_size = windowed_team_df.groupby(level=["github_user_cleaned_url", "week_since_repo_creation"])["active contributors"].value_counts()[:,:,True]
max_team_size = team_size.groupby(level="github_user_cleaned_url").max()
# plot histogram
bins = [1, 2, 5, 10]
if max_team_size.max() > bins[-1]:
Expand All @@ -82,6 +122,13 @@ def plot_team_size(metadata, contributions, ax):
ax.set(xlabel="maximum team size", ylabel="repository count")

def plot_readme_size(contents, ax, type="bar"):
"""Plot a histogram of the size of the README file found in repositories. The bin limits were chosen empirically.
contents (pd.DataFrame): contents data mined from GitHub
ax (Axes): subplot to use
type (str, optional): plot type, can be "bar" or "pie". Defaults to "bar".
bins = [0, 1, 300, 1500, 10000]
binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
if contents.readme_size.max() > bins[-1]:
Expand All @@ -99,14 +146,23 @@ def plot_readme_size(contents, ax, type="bar"):
ax.set(xlabel="size of README in Bytes")

def plot_headings(readme_df, ax):
"""Plot a wordcloud from the headings used in README files. Excludes some manually defined words that skew the results too much to be meaningful.
readme_df (pd.DataFrame): readme history data mined from GitHub, including all headings ever added to the README
ax (Axes): subplot to use
# clean any existing headings
headings = []
for l in readme_df.added_headings.dropna():
headings += ast.literal_eval(l)
headings = [clean_heading(h) for h in headings]

# manually exclude words that were found to skew the distribution
stopwords = STOPWORDS
custom = set(["trades", "glosat", "glosat_table_dataset", "nilmtk", "bert", "lemon", "cascadetabnet"])
stopwords = stopwords.union(custom)
# plot wordcloud
wordcloud = WordCloud(
Expand All @@ -119,6 +175,14 @@ def plot_headings(readme_df, ax):
ax.set(title="README headings")

def plot_table(metadata, stars, forks, ax):
"""Add a table with basic stats (repository age, fork counts, star counts).
metadata (pd.DataFrame): metadata mined from GitHub.
stars (pd.DataFrame): stars data mined from GitHub.
forks (pd.DataFrame): forks data mined from GitHub.
ax (Axes): subplot to use
age = ( - metadata["created_at"]).dt.days // 7
fork_counts = forks.groupby("github_user_cleaned_url")["user"].count()
fork_counts.rename("forks_no", inplace=True)
Expand All @@ -138,7 +202,7 @@ def plot_table(metadata, stars, forks, ax):

def main(data_dir, verbose, filter_path, tag):
def main(data_dir, outdir, verbose, filter_path, tag):
info(verbose, "Loading data...")
contents = pd.read_csv(os.path.join(data_dir, "contents.csv"), index_col=0)
metadata = pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0)
Expand All @@ -149,7 +213,7 @@ def main(data_dir, verbose, filter_path, tag):
stars = pd.read_csv(os.path.join(data_dir, "stars.csv"), index_col=0)
forks = pd.read_csv(os.path.join(data_dir, "forks.csv"), index_col=0)

if filter_path is not None:
if filter_path is not None: # e.g. filter for high-interest repositories based on a txt file containing a list of those
info(verbose, "Filtering data...")
with open(filter_path, "r") as f:
filtered = [line.rstrip() for line in f]
Expand Down Expand Up @@ -179,19 +243,20 @@ def main(data_dir, verbose, filter_path, tag):
plot_table(metadata, stars, forks, ax7)
if tag:
plt.suptitle(f"Overall statistics for ePrints repositories ({tag})")
plt.savefig(os.path.join(data_dir, "overall", f"overall_{tag}.png"), bbox_inches="tight")
plt.savefig(os.path.join(outdir, "overall", f"overall_{tag}.png"), bbox_inches="tight")
plt.suptitle("Overall statistics for ePrints repositories")
plt.savefig(os.path.join(data_dir, "overall", "overall.png"), bbox_inches="tight")
plt.savefig(os.path.join(outdir, "overall", "overall.png"), bbox_inches="tight")

if __name__=="__main__":
parser = argparse.ArgumentParser(
description="Plot overall repo analysis."
parser.add_argument("--dir", default="../data/analysis", type=str, help="path to data directory")
parser.add_argument("--datadir", default="../../data/raw/github", type=str, help="path to GitHub data directory")
parser.add_argument("--outdir", default="../../data/derived", type=str, help="path to output data directory")
parser.add_argument("-v", "--verbose", action="store_true", help="enable verbose output")
parser.add_argument("--filter", type=str, help="path to file with repos to consider")
parser.add_argument("--tag", type=str, help="tag name to use")
parser.add_argument("--filter", type=str, help="path to file listing the repos that should be considered")
parser.add_argument("--tag", type=str, help="tag to add to the filename, e.g. to indicate that the repositories were filtered")
args = parser.parse_args()
main(args.dir, args.verbose, args.filter, args.tag)
main(args.datadir, args.outdir, args.verbose, args.filter, args.tag)

0 comments on commit c2fb143

Please sign in to comment.