-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_developers.py
56 lines (43 loc) · 1.65 KB
/
extract_developers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from csv import QUOTE_ALL
from pandas import DataFrame, concat, read_csv
from common import cleanup, initialize, logger, paths, preprocessed, refresh
# Run the shared setup imported from `common` before any other work in this
# script. NOTE(review): what initialize() configures is not visible in this
# file — confirm against `common`.
initialize()
# Module-level logger obtained by passing this file's path to `common.logger`;
# used by extract_developers() to report progress.
log = logger(__file__)
def import_timelines(project):
    """Load the actor/author columns of one project's preprocessed timelines.

    Only the ``actor``, ``author.name`` and ``author.email`` columns are read
    from the CSV located via ``paths("timelines_preprocessed", project)``.
    Rows with a missing value in any of those columns are discarded.
    """
    wanted_columns = ["actor", "author.name", "author.email"]
    source = paths("timelines_preprocessed", project)
    timelines = read_csv(source, usecols=wanted_columns, quoting=QUOTE_ALL)
    return timelines.dropna()
def export_developers(developers):
    """Write the developers mapping to the CSV located via ``paths("developers")``.

    ``developers`` maps each actor to a dict of column values. Every key
    becomes one row, rows are sorted by key, and the index column is labelled
    ``actor``. All fields are quoted in the output.
    """
    table = DataFrame.from_dict(developers, orient="index")
    table = table.sort_index()
    table = table.rename_axis("actor")
    table.to_csv(paths("developers"), quoting=QUOTE_ALL)
def extract_developers():
    """Collect every actor's author names and emails and export them.

    Reads the preprocessed timelines of all projects, discards events whose
    author email has no ``@`` or matches ``noreply``/``no-reply``, skips the
    ``ghost`` actor, and exports one entry per remaining actor holding the
    sorted, comma-separated unique author names and emails.
    """
    log.info("Extracting developers")
    # ignore_index=True gives every row a unique label. Each per-project frame
    # otherwise keeps its own 0-based labels, so labels repeat after concat and
    # any label-based operation would hit rows from unrelated projects.
    events = concat(
        [import_timelines(project) for project in preprocessed()],
        ignore_index=True,
    ).astype({"actor": "category", "author.name": "category", "author.email": "category"})

    emails = events["author.email"]
    has_at_sign = emails.str.contains("@", regex=False)
    is_noreply = emails.str.contains("noreply|no-reply", regex=True)
    # Filter with a boolean mask instead of dropping by index label, so that
    # only the offending rows themselves are removed.
    events = events[has_at_sign & ~is_noreply]

    # One grouped pass instead of re-scanning the whole frame per actor.
    # observed=True skips actors whose events were all filtered out above.
    developers = {
        actor: {
            "name": ", ".join(sorted(actor_events["author.name"].unique())),
            "email": ", ".join(sorted(actor_events["author.email"].unique())),
        }
        for actor, actor_events in events.groupby("actor", observed=True)
        if actor != "ghost"
    }
    export_developers(developers)
def main():
    """Extract developers unless ``cleanup`` reports there is nothing to do.

    ``cleanup("developers", refresh())`` decides whether extraction runs; when
    it returns a falsy value a skip message is printed instead.
    """
    if not cleanup("developers", refresh()):
        print("Skip extracting developers")
        return
    extract_developers()
# Script entry point: run main() and turn Ctrl-C into a clean non-zero exit.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Stop extracting developers")
        # Raise SystemExit directly: the bare `exit` helper is injected by the
        # `site` module for interactive use and is missing under `python -S`
        # or in frozen builds.
        raise SystemExit(1)