-
Notifications
You must be signed in to change notification settings - Fork 7
99 lines (86 loc) · 3.68 KB
/
reddit_scraper.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
name: Reddit Stash Scraper

on:
  schedule:
    # TIMEZONE INFORMATION:
    # - GitHub Actions evaluates every cron schedule in UTC.
    # - CET is UTC+1 (winter); CEST is UTC+2 (summer).
    # - Local -> UTC: subtract your UTC offset
    #   (e.g. 14:00 CEST - 2 hours = 12:00 UTC).
    # - UTC -> local: add your UTC offset
    #   (e.g. 12:00 UTC + 2 hours = 14:00 CEST).
    #
    # ADJUSTING FOR YOUR TIMEZONE:
    # 1. Determine your timezone's UTC offset
    #    (examples: US Eastern = UTC-4/5, Japan = UTC+9, UK = UTC+0/1).
    # 2. For desired local time X, set the UTC time to X - (your UTC offset).
    # 3. Update both cron expressions below with the calculated UTC times.

    # Peak window: every 2 hours at 06:00, 08:00, ..., 20:00 UTC
    # (08:00-22:00 CEST in summer).
    - cron: '0 6-21/2 * * *'
    # Off-peak: twice, at 23:00 and 03:00 UTC
    # (01:00 and 05:00 CEST in summer).
    - cron: '0 23,3 * * *'
  workflow_dispatch:  # Manual trigger option
# Allow only one run per workflow/ref at a time. Because cancel-in-progress
# is enabled, a newly triggered run cancels the run already in progress
# instead of queueing behind it.
concurrency:
  group: '${{ github.workflow }}-${{ github.ref }}'
  cancel-in-progress: true
jobs:
  # Single job: fetch the previous log from Dropbox, run the Reddit scraper,
  # then upload the changed files back to Dropbox.
  run-script:
    runs-on: ubuntu-latest
    timeout-minutes: 10  # Kill job if it runs longer than 10 minutes
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        timeout-minutes: 1

      # Build a key that changes twice a year (e.g. 2025-H1, 2025-H2).
      # It is part of the pip cache key below, so the dependency cache is
      # rebuilt from scratch every six months.
      - name: Create semi-annual cache key
        id: cache-key
        run: |
          # Get current year and determine which half of the year we're in (1 or 2)
          year=$(date +'%Y')
          month=$(date +'%-m')
          half_year=$((month <= 6 ? 1 : 2))
          echo "date=${year}-H${half_year}" >> "$GITHUB_OUTPUT"
          echo "Using cache key: ${year}-H${half_year}"

      # pip caching is done by the explicit cache step below rather than by
      # setup-python's built-in `cache: pip`: the built-in cache key is derived
      # from the dependency file hash only and cannot carry the semi-annual
      # component.
      - name: Set up Python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      # Cache pip's download/wheel cache. The key changes when the OS, the
      # resolved Python version, the half-year, or requirements.txt changes.
      # No restore-keys on purpose: a half-year rollover must start clean.
      - name: Cache pip downloads
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ steps.setup-python.outputs.python-version }}-${{ steps.cache-key.outputs.date }}-${{ hashFiles('requirements.txt') }}

      # Install dependencies with pip
      - name: Install dependencies
        env:
          PIP_DISABLE_PIP_VERSION_CHECK: '1'
          PIP_NO_WARN_SCRIPT_LOCATION: '1'
          PIP_ROOT_USER_ACTION: ignore
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Download only the log file from Dropbox
      - name: Download log file from Dropbox
        timeout-minutes: 2
        env:
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
        run: |
          python dropbox_utils.py --download # downloads only the log file

      # Process Reddit data
      - name: Run Reddit Stash Script
        timeout-minutes: 3
        env:
          REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
          REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
          REDDIT_USERNAME: ${{ secrets.REDDIT_USERNAME }}
          REDDIT_PASSWORD: ${{ secrets.REDDIT_PASSWORD }}
        run: |
          python reddit_stash.py # process the files

      # Upload any changed files to Dropbox
      - name: Upload updated data to Dropbox
        timeout-minutes: 2
        env:
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
        run: |
          python dropbox_utils.py --upload # uploads only changed files