Skip to content

Commit db97548

Browse files
author
Jackson Maxfield Brown
authored Jun 15, 2022
feature/split-indexing-into-multi-step-workflow (#108)
* Fan out index upload to single file chunks (#1) * Install branch of cdp-backend * Fast fail false * Fix fast fail indent * Working event index * Upgrade frontend
1 parent f26e4bb commit db97548

File tree

1 file changed

+100
-12
lines changed

1 file changed

+100
-12
lines changed
 
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
name: Event Index
22

33
on:
4-
push:
5-
branches:
6-
- main
4+
workflow_dispatch:
5+
76
schedule:
87
# <minute [0,59]> <hour [0,23]> <day of the month [1,31]> <month of the year [1,12]> <day of the week [0,6]>
98
# https://pubs.opengroup.org/onlinepubs/9699919799/utilities/crontab.html#tag_20_25_07
@@ -12,39 +11,128 @@ on:
1211
# We offset from the hour and half hour to go easy on the servers :)
1312
- cron: '26 3 * * 4'
1413

14+
# We doubly fan out
15+
# We first generate indexes for uni, bi, and trigrams with a matrix
16+
# Each index is split into chunks of 50,000 grams
17+
# Then we fan out by every chunk and upload
18+
{% raw %}
1519
jobs:
16-
index-events:
20+
generate-index-chunks:
1721
runs-on: ubuntu-latest
1822
strategy:
1923
matrix:
20-
# We fan out on n-gram to make it possible to run on GitHub Actions
2124
n-gram: [1, 2, 3]
25+
fail-fast: false
26+
27+
outputs:
28+
ngram-1-chunks: ${{ steps.output-index-chunks.outputs.ngram-1-chunks }}
29+
ngram-2-chunks: ${{ steps.output-index-chunks.outputs.ngram-2-chunks }}
30+
ngram-3-chunks: ${{ steps.output-index-chunks.outputs.ngram-3-chunks }}
2231

2332
steps:
33+
# Setup Runner
2434
- uses: actions/checkout@v2
2535
- uses: actions/setup-python@v1
2636
with:
2737
python-version: 3.9
2838

39+
# Setup GCloud / Creds
2940
- name: Setup gcloud
3041
uses: google-github-actions/setup-gcloud@v0
3142
with:
32-
project_id: {{ cookiecutter.infrastructure_slug }}
33-
service_account_key: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %}
43+
project_id: {% endraw %}{{ cookiecutter.infrastructure_slug }}{% raw %}
44+
service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }}
3445
export_default_credentials: true
46+
- name: Dump Credentials to JSON
47+
run: |
48+
echo "$GOOGLE_CREDS" > python/google-creds.json
49+
env:
50+
GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }}
3551

52+
# Installs
3653
- name: Install Python Dependencies
3754
run: |
3855
cd python/
3956
pip install .
57+
58+
# Index
59+
- name: Index Events ${{ matrix.n-gram }}-grams
60+
run: |
61+
cd python/
62+
run_cdp_event_index_generation event-index-config.json \
63+
--n_grams ${{ matrix.n-gram }} \
64+
--store_remote \
65+
--parallel
66+
67+
# Store generated files to step output
68+
- name: Store Index Fileset to Outputs
69+
id: output-index-chunks
70+
run: |
71+
cd python/index/
72+
output=$(python -c 'import os, json; print(json.dumps(os.listdir(".")))')
73+
echo "::set-output name=ngram-${{ matrix.n-gram }}-chunks::$output"
74+
75+
combine-matrix-ngram-chunks:
76+
needs: generate-index-chunks
77+
runs-on: ubuntu-latest
78+
outputs:
79+
all-chunks: ${{ steps.combine-index-chunks.outputs.combined-chunks }}
80+
81+
steps:
82+
# Setup Runner
83+
- uses: actions/checkout@v2
84+
- uses: actions/setup-python@v1
85+
with:
86+
python-version: 3.9
87+
88+
# Process
89+
- name: Combine Chunks
90+
id: 'combine-index-chunks'
91+
run: |
92+
echo 'print(${{ needs.generate-index-chunks.outputs.ngram-1-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-2-chunks }} + ${{ needs.generate-index-chunks.outputs.ngram-3-chunks }})' >> print-combined-chunks.py
93+
output=$(python print-combined-chunks.py)
94+
echo "::set-output name=combined-chunks::$output"
95+
96+
upload-index-chunks:
97+
needs: combine-matrix-ngram-chunks
98+
runs-on: ubuntu-latest
99+
strategy:
100+
max-parallel: 6
101+
matrix:
102+
filename: ${{ fromJson(needs.combine-matrix-ngram-chunks.outputs.all-chunks) }}
103+
fail-fast: false
104+
105+
steps:
106+
# Setup Runner
107+
- uses: actions/checkout@v2
108+
- uses: actions/setup-python@v1
109+
with:
110+
python-version: 3.9
111+
112+
# Setup GCloud / Creds
113+
- name: Setup gcloud
114+
uses: google-github-actions/setup-gcloud@v0
115+
with:
116+
project_id: {% endraw %}{{ cookiecutter.infrastructure_slug }}{% raw %}
117+
service_account_key: ${{ secrets.GOOGLE_CREDENTIALS }}
118+
export_default_credentials: true
40119
- name: Dump Credentials to JSON
41120
run: |
42121
echo "$GOOGLE_CREDS" > python/google-creds.json
43122
env:
44-
GOOGLE_CREDS: {% raw %}${{ secrets.GOOGLE_CREDENTIALS }}{% endraw %}
45-
- name: Index Events {% raw %}${{ matrix.n-gram }}{% endraw %}-grams
123+
GOOGLE_CREDS: ${{ secrets.GOOGLE_CREDENTIALS }}
124+
125+
# Installs
126+
- name: Install Python Dependencies
46127
run: |
47128
cd python/
48-
{% raw %}run_cdp_event_index event-index-config.json \
49-
--n_grams ${{ matrix.n-gram }} \
50-
--parallel{% endraw %}
129+
pip install .
130+
131+
# Upload Index Chunk
132+
- name: Process Upload
133+
run: |
134+
cd python/
135+
process_cdp_event_index_chunk event-index-config.json \
136+
${{ matrix.filename }} \
137+
--parallel
138+
{% endraw %}

0 commit comments

Comments
 (0)
Please sign in to comment.