Skip to content

Commit

Permalink
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org…
Browse files Browse the repository at this point in the history
… into production
  • Loading branch information
tunetheweb committed Nov 21, 2024
2 parents 118947a + 9855e25 commit 8800cfc
Show file tree
Hide file tree
Showing 15 changed files with 322 additions and 23 deletions.
6 changes: 3 additions & 3 deletions sql/2024/cookies/0_extract_cookies.sql
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ WITH intermediate_cookie AS (
page,
root_page,
rank,
JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime,
payload.startedDateTime AS startedDateTime,
cookie
FROM
`httparchive.all.pages`,
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie
`httparchive.crawl.pages`,
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics.cookies)) AS cookie
WHERE
date = '2024-06-01'
)
Expand Down
91 changes: 91 additions & 0 deletions sql/2024/third-parties/depth_of_gtm_calls.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
-- Returns the distinct initiator eTLDs reachable from `rootPage`.
--
-- Starting at `rootPage`, every row of `data` whose `root_page` equals the
-- current page contributes its `initiator_etld`; each newly discovered
-- initiator is then looked up as a page in turn. The result keeps
-- first-discovery order: all initiators of a page are recorded before any of
-- them is expanded, and they are expanded in that same order.
--
-- Arguments:
--   rootPage: the page (registrable domain) the walk starts from.
--   data:     rows of (root_page, third_party, initiator_etld). Only
--             `root_page` and `initiator_etld` are read by this function.
-- Returns: ARRAY<STRING> of distinct initiator eTLDs (empty when none match).
CREATE TEMP FUNCTION findAllInitiators(rootPage STRING, data ARRAY<STRUCT<root_page STRING, third_party STRING, initiator_etld STRING>>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  // Index initiator_etld values by root_page once, preserving row order, so
  // each lookup below is a map access instead of a full scan of `data`.
  const initiatorsByPage = new Map();
  for (const row of data || []) {
    const bucket = initiatorsByPage.get(row.root_page);
    if (bucket === undefined) {
      initiatorsByPage.set(row.root_page, [row.initiator_etld]);
    } else {
      bucket.push(row.initiator_etld);
    }
  }

  // A Set gives O(1) membership checks and iterates in insertion order, so it
  // doubles as the (already de-duplicated) result list.
  const visited = new Set();

  function findInitiators(page) {
    // Decide what is new BEFORE marking anything visited, then record the
    // whole batch, then expand each entry. This keeps discovery order stable.
    const fresh = (initiatorsByPage.get(page) || []).filter(etld => !visited.has(etld));
    fresh.forEach(etld => visited.add(etld));
    // Every entry was marked visited above, so cycles cannot recurse forever.
    fresh.forEach(etld => findInitiators(etld));
  }

  findInitiators(rootPage);
  return Array.from(visited);
""";



-- Finds what follows "googletagmanager.com" in an ordered list of domains.
--
-- Arguments:
--   input_array: ordered list of domains; a NULL array is treated as empty.
-- Returns a STRUCT with:
--   mean_depth:    mean 1-based position within `input_array` of the elements
--                  that directly follow a "googletagmanager.com" entry, or
--                  NULL when there is no such element (GTM absent, or GTM is
--                  the last entry).
--   next_elements: the elements directly following each GTM entry, in order
--                  (empty when none).
--
-- The depth is taken from the follower's position in `input_array`. Averaging
-- the index inside the collected followers instead would report 2 for every
-- single match, regardless of where GTM actually sits in the list.
CREATE TEMP FUNCTION mean_depth_and_next_element_after_gtm(input_array ARRAY<STRING>)
RETURNS STRUCT<mean_depth FLOAT64, next_elements ARRAY<STRING>>
LANGUAGE js AS """
  const GTM = 'googletagmanager.com';
  const chain = input_array || [];

  const nextElements = [];
  let depthTotal = 0;

  // Stop one short of the end: the last entry has no follower to capture.
  for (let i = 0; i < chain.length - 1; i++) {
    if (chain[i] === GTM) {
      nextElements.push(chain[i + 1]);
      // The follower sits at 0-based index i + 1, i.e. 1-based position i + 2.
      depthTotal += i + 2;
    }
  }

  // No GTM entry with a follower: signal "not found" with a NULL depth.
  if (nextElements.length === 0) {
    return { mean_depth: null, next_elements: [] };
  }

  return {
    mean_depth: depthTotal / nextElements.length,
    next_elements: nextElements
  };
""";


-- Counts, per client, how often each "element(s) following googletagmanager.com"
-- list occurs across root pages, ordered by ascending count.
WITH third_party_requests AS (
  -- Requests whose registrable domain differs from that of their root page,
  -- reduced to (client, root page domain, request domain, initiator domain).
  SELECT
    client,
    NET.REG_DOMAIN(root_page) AS root_page,
    NET.REG_DOMAIN(url) AS third_party,
    NET.REG_DOMAIN(JSON_VALUE(payload, '$._initiator')) AS initiator_etld
  FROM
    `httparchive.all.requests`
  WHERE
    date = '2024-06-01' AND
    NET.REG_DOMAIN(root_page) != NET.REG_DOMAIN(url)
),

third_party_interactions AS (
  -- Distinct rows where the initiator is neither the requested third party
  -- nor the root page itself. Rows with a NULL domain on either side of a
  -- comparison are dropped, because `!=` with NULL is not TRUE.
  SELECT DISTINCT
    client,
    root_page,
    third_party,
    initiator_etld
  FROM
    third_party_requests
  WHERE
    third_party != initiator_etld AND
    root_page != initiator_etld
),

initiators_per_page AS (
  -- One initiator list per (client, root page).
  -- NOTE(review): ARRAY_AGG has no ORDER BY, so the order of the rows handed
  -- to findAllInitiators (and therefore which element ends up "after" GTM)
  -- is not guaranteed to be stable between runs -- confirm this is acceptable.
  SELECT
    client,
    root_page,
    findAllInitiators(
      root_page,
      ARRAY_AGG(STRUCT(root_page, third_party, initiator_etld))
    ) AS all_initiators
  FROM
    third_party_interactions
  GROUP BY
    client,
    root_page
),

elements_after_gtm AS (
  -- UNNEST of a single-element array binds the UDF result to `result`, so its
  -- fields can be used in both SELECT and WHERE.
  -- A NULL mean_depth means no element was found after GTM; skip those pages.
  SELECT
    client,
    result.next_elements AS next_elements_after_gtm
  FROM
    initiators_per_page,
    UNNEST([mean_depth_and_next_element_after_gtm(all_initiators)]) AS result
  WHERE
    result.mean_depth IS NOT NULL
)

SELECT
  client,
  next_elements_after_gtm,
  COUNT(0) AS c
FROM
  elements_after_gtm
GROUP BY
  client,
  next_elements_after_gtm
ORDER BY
  c;
3 changes: 1 addition & 2 deletions src/config/2024.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@
"part": "I",
"chapter_number": "8",
"title": "Third Parties",
"slug": "third-parties",
"todo": true
"slug": "third-parties"
}
]
},
Expand Down
35 changes: 32 additions & 3 deletions src/config/contributors.json
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,8 @@
"2024": [
"developers",
"editors",
"committee"
"committee",
"reviewers"
]
},
"twitter": "tunetheweb",
Expand Down Expand Up @@ -1014,9 +1015,11 @@
},
"ChrisBeeti": {
"github": "ChrisBeeti",
"name": "Chris Beeti",
"name": "Chris Böttger",
"teams": {
"2024": [
"analysts",
"authors",
"committee"
]
}
Expand Down Expand Up @@ -4428,9 +4431,11 @@
"name": "Tobias Urban",
"teams": {
"2024": [
"authors",
"committee"
]
}
},
"website": "https://internet-sicherheit.de/ueber-uns/team/alle-mitarbeiter/urban-tobias-2/"
},
"bobbyshaw": {
"avatar_url": "553566",
Expand Down Expand Up @@ -4690,6 +4695,19 @@
]
}
},
"Yash-Vekaria": {
"avatar_url": "30694521",
"github": "Yash-Vekaria",
"name": "Yash Vekaria",
"teams": {
"2024": [
"analysts",
"authors"
]
},
"twitter": "vekariayash",
"website": "https://yash-vekaria.github.io"
},
"yoavweiss": {
"avatar_url": "786187",
"github": "yoavweiss",
Expand Down Expand Up @@ -4811,6 +4829,17 @@
]
}
},
"zubairshafiq": {
"github": "zubairshafiq",
"name": "Zubair Shafiq",
"teams": {
"2024": [
"authors"
]
},
"twitter": "zubair_shafiq",
"website": "http://www.cs.ucdavis.edu/~zubair"
},
"Zuckjet": {
"avatar_url": "17976139",
"github": "Zuckjet",
Expand Down
6 changes: 3 additions & 3 deletions src/config/last_updated.json
Original file line number Diff line number Diff line change
Expand Up @@ -852,9 +852,9 @@
"hash": "bb49d876d3e33811819746edc96ed447"
},
"en/2024/chapters/third-parties.html": {
"date_published": "2024-11-11T00:00:00.000Z",
"date_modified": "2024-11-16T00:00:00.000Z",
"hash": "124fe4e80189dd401c4f4d0bfeb361dd"
"date_published": "2024-11-21T00:00:00.000Z",
"date_modified": "2024-11-21T00:00:00.000Z",
"hash": "075bec99b73be68c6fa7b97b97808182"
},
"en/2024/chapters/webassembly.html": {
"date_published": "2024-11-11T00:00:00.000Z",
Expand Down
Loading

0 comments on commit 8800cfc

Please sign in to comment.