Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Cohorts model #246

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 34 additions & 14 deletions data/transform/models/marts/telemetry/base/projects.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
WITH active_projects AS (

SELECT structured_events.project_id
WITH active_events AS (
SELECT
structured_events.project_id,
structured_events.event_created_at,
structured_events.event_count
FROM {{ ref('structured_events') }}
LEFT JOIN
{{ ref('cmd_parsed_all') }} ON
Expand All @@ -13,25 +15,43 @@ WITH active_projects AS (
'meltano test',
'meltano schedule run'
)
AND structured_events.event_created_at >= DATEADD(
'month', -1, CURRENT_DATE()
)
),

active_projects AS (

SELECT
project_id,
DATE_TRUNC('month', event_created_at) AS month_start,
SUM(event_count) AS active_events_count
FROM active_events
GROUP BY 1, 2

),

active_profile AS (

SELECT
structured_events.project_id,
MIN(structured_events.event_created_at) AS activation_at,
MAX(structured_events.event_created_at) AS last_active_at
FROM {{ ref('structured_events') }}
INNER JOIN active_projects
ON structured_events.project_id = active_projects.project_id
AND DATE_TRUNC(
'month', structured_events.event_created_at
) = active_projects.month_start
WHERE active_projects.active_events_count > 1
GROUP BY 1
HAVING SUM(structured_events.event_count) > 1

)

SELECT
structured_events.project_id,
MAX(
CASE WHEN active_projects.project_id IS NOT NULL THEN TRUE END
) AS is_active,
MAX(active_profile.activation_at) AS activation_date,
MAX(active_profile.last_active_at) AS last_activate_at,
MIN(structured_events.event_created_at) AS first_event_at,
MAX(structured_events.event_created_at) AS last_event_at
FROM {{ ref('structured_events') }}
LEFT JOIN
{{ ref('cmd_parsed_all') }} ON
structured_events.command = cmd_parsed_all.command
LEFT JOIN
active_projects ON structured_events.project_id = active_projects.project_id
active_profile ON structured_events.project_id = active_profile.project_id
GROUP BY 1
260 changes: 260 additions & 0 deletions data/transform/models/marts/telemetry/cohorts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
WITH base AS (
SELECT
structured_events.project_id,
structured_events.event_id,
structured_events.event_count,
structured_events.event_created_at,
projects.first_event_at,
projects.activation_date,
projects.last_activate_at,
COALESCE(structured_events.event_created_at >= projects.activation_date
AND structured_events.event_created_at <= projects.last_activate_at,
FALSE) AS is_active
FROM {{ ref('structured_events') }}
LEFT JOIN
{{ ref('projects') }} ON
structured_events.project_id = projects.project_id
),

plugin_categories_agg AS (

SELECT
'week' AS period_grain,
'active' AS agg_type,
DATE_TRUNC(
'week', base.event_created_at
) AS period_date,
base.project_id,
COUNT(DISTINCT fact_plugin_usage.plugin_category) AS plugin_distinct_cnt
FROM base
INNER JOIN {{ ref('fact_plugin_usage') }}
ON base.event_id = fact_plugin_usage.event_id
-- Active
WHERE base.is_active
GROUP BY 1, 2, 3, 4

UNION ALL

SELECT
'week' AS period_grain,
'active_prod' AS agg_type,
DATE_TRUNC(
'week', base.event_created_at
) AS period_date,
base.project_id,
COUNT(
DISTINCT CASE
WHEN
environments.env_name = 'prod'
THEN fact_plugin_usage.plugin_category
END
) AS plugin_distinct_cnt
FROM base
INNER JOIN {{ ref('fact_plugin_usage') }}
ON base.event_id = fact_plugin_usage.event_id
LEFT JOIN
{{ ref('environments') }} ON
base.event_id = environments.event_id
-- Active
WHERE base.is_active
GROUP BY 1, 2, 3, 4

UNION ALL

SELECT
'week' AS period_grain,
'active_dev' AS agg_type,
DATE_TRUNC(
'week', base.event_created_at
) AS period_date,
base.project_id,
COUNT(
DISTINCT CASE
WHEN
environments.env_name = 'dev'
THEN fact_plugin_usage.plugin_category
END
) AS plugin_distinct_cnt
FROM base
INNER JOIN {{ ref('fact_plugin_usage') }}
ON base.event_id = fact_plugin_usage.event_id
LEFT JOIN
{{ ref('environments') }} ON
base.event_id = environments.event_id
WHERE base.is_active
GROUP BY 1, 2, 3, 4

),

cohort_snapshots AS (

-- Event usage for all projects at the month grain
SELECT
'month' AS period_grain,
DATE_TRUNC(
'month', event_created_at
) AS snapshot_period,
DATE_TRUNC('month', first_event_at) AS cohort_id,
DATEDIFF(
MONTH,
DATE_TRUNC('month', first_event_at),
DATE_TRUNC('month', event_created_at)
) AS cohort_number,
'event_volume' AS cohort_type,
SUM(event_count) AS snapshot_value
FROM base
GROUP BY 1, 2, 3, 4, 5

UNION ALL

-- Active projects at the month grain
SELECT
'month' AS period_grain,
DATE_TRUNC(
'month', event_created_at
) AS snapshot_period,
DATE_TRUNC('month', first_event_at) AS cohort_id,
DATEDIFF(
MONTH,
DATE_TRUNC('month', first_event_at),
DATE_TRUNC('month', event_created_at)
) AS cohort_number,
'active_projects' AS cohort_type,
COUNT(DISTINCT project_id) AS snapshot_value
FROM base
WHERE is_active
GROUP BY 1, 2, 3, 4, 5

UNION ALL

-- APP for active projects at the week grain
SELECT
'week' AS period_grain,
DATE_TRUNC(
'week', base.event_created_at
) AS snapshot_period,
DATE_TRUNC('week', base.first_event_at) AS cohort_id,
DATEDIFF(
WEEK,
DATE_TRUNC('week', base.first_event_at),
DATE_TRUNC('week', base.event_created_at)
) AS cohort_number,
'APP_active' AS cohort_type,
AVG(plugin_categories_agg.plugin_distinct_cnt) AS snapshot_value
FROM base
LEFT JOIN
plugin_categories_agg ON
base.project_id = plugin_categories_agg.project_id
AND DATE_TRUNC(
'week', base.event_created_at
) = plugin_categories_agg.period_date
WHERE base.is_active
AND plugin_categories_agg.period_grain = 'week'
AND plugin_categories_agg.agg_type = 'active'
GROUP BY 1, 2, 3, 4, 5

UNION ALL

-- APP for active projects at the month grain
SELECT
'month' AS period_grain,
DATE_TRUNC(
'month', base.event_created_at
) AS snapshot_period,
DATE_TRUNC('month', base.first_event_at) AS cohort_id,
DATEDIFF(
MONTH,
DATE_TRUNC('month', base.first_event_at),
DATE_TRUNC('month', base.event_created_at)
) AS cohort_number,
'APP_active' AS cohort_type,
AVG(plugin_categories_agg.plugin_distinct_cnt) AS snapshot_value
FROM base
LEFT JOIN
plugin_categories_agg ON
base.project_id = plugin_categories_agg.project_id
AND DATE_TRUNC(
'month', base.event_created_at
) = plugin_categories_agg.period_date
WHERE base.is_active
AND plugin_categories_agg.period_grain = 'month'
AND plugin_categories_agg.agg_type = 'active'
GROUP BY 1, 2, 3, 4, 5

UNION ALL

-- APP for active projects using the prod environment at the week grain
SELECT
'week' AS period_grain,
DATE_TRUNC(
'week', base.event_created_at
) AS snapshot_period,
DATE_TRUNC('week', base.first_event_at) AS cohort_id,
DATEDIFF(
WEEK,
DATE_TRUNC('week', base.first_event_at),
DATE_TRUNC('week', base.event_created_at)
) AS cohort_number,
'APP_active_prod' AS cohort_type,
AVG(plugin_categories_agg.plugin_distinct_cnt) AS snapshot_value
FROM base
LEFT JOIN
plugin_categories_agg ON
base.project_id = plugin_categories_agg.project_id
AND DATE_TRUNC(
'week', base.event_created_at
) = plugin_categories_agg.period_date
WHERE base.is_active
AND plugin_categories_agg.period_grain = 'week'
AND plugin_categories_agg.agg_type = 'active_prod'
GROUP BY 1, 2, 3, 4, 5

UNION ALL

-- APP for active projects using the dev environment at the week grain
SELECT
'week' AS period_grain,
DATE_TRUNC(
'week', base.event_created_at
) AS snapshot_period,
DATE_TRUNC('week', base.first_event_at) AS cohort_id,
DATEDIFF(
WEEK,
DATE_TRUNC('week', base.first_event_at),
DATE_TRUNC('week', base.event_created_at)
) AS cohort_number,
'APP_active_dev' AS cohort_type,
AVG(plugin_categories_agg.plugin_distinct_cnt) AS snapshot_value
FROM base
LEFT JOIN
plugin_categories_agg ON
base.project_id = plugin_categories_agg.project_id
AND DATE_TRUNC(
'week', base.event_created_at
) = plugin_categories_agg.period_date
WHERE base.is_active
AND plugin_categories_agg.period_grain = 'week'
AND plugin_categories_agg.agg_type = 'active_dev'
GROUP BY 1, 2, 3, 4, 5

),

originals AS (
SELECT
cohort_id,
snapshot_value,
cohort_type
FROM cohort_snapshots
WHERE cohort_id = snapshot_period
)

SELECT
cohort_snapshots.period_grain,
cohort_snapshots.cohort_id,
cohort_snapshots.snapshot_period,
cohort_snapshots.cohort_type,
cohort_snapshots.snapshot_value,
cohort_snapshots.cohort_number,
originals.snapshot_value AS original_snapshot_value
FROM cohort_snapshots
LEFT JOIN originals ON cohort_snapshots.cohort_id = originals.cohort_id
4 changes: 3 additions & 1 deletion data/transform/models/marts/telemetry/fact_plugin_usage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ SELECT
-- projects
structured_events.project_id,
projects.first_event_at AS project_created_at,
projects.is_active AS project_is_active,
COALESCE(projects.last_activate_at >= DATEADD(
'month', -1, CURRENT_DATE()
), FALSE) AS project_is_active,
-- environments
cmd_parsed_all.environment AS env_id,
environments.env_name,
Expand Down