Skip to content

Commit

Permalink
Merge branch 'main' of github.com:HTTPArchive/almanac.httparchive.org…
Browse files Browse the repository at this point in the history
… into production
  • Loading branch information
tunetheweb committed Oct 15, 2024
2 parents 5ff6eda + f432f71 commit 518c197
Show file tree
Hide file tree
Showing 92 changed files with 2,039 additions and 79 deletions.
61 changes: 61 additions & 0 deletions sql/2024/structured-data/classic_microformats_types.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# standardSQL
# classic_microformats_types.sql
# Count Classic Microformats types

CREATE TEMP FUNCTION getClassicMicroformatsTypes(rendered STRING)
RETURNS ARRAY<STRUCT<name STRING, count NUMERIC>>
LANGUAGE js AS """
try {
rendered = JSON.parse(rendered);
return rendered.microformats_classic_types.map(microformats_classic_type => ({name: microformats_classic_type.name, count: microformats_classic_type.count}));
} catch (e) {
return [];
}
""";

WITH
rendered_data AS (
SELECT
client,
root_page AS url,
getClassicMicroformatsTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS classic_microformats_types
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
),

page_totals AS (
SELECT
client,
COUNT(DISTINCT root_page) AS total_pages
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
GROUP BY
client
)

SELECT
client,
classic_microformats_type.name AS classic_microformats_type,
SUM(classic_microformats_type.count) AS freq_microformat,
SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS total_microformat,
SUM(classic_microformats_type.count) / SUM(SUM(classic_microformats_type.count)) OVER (PARTITION BY client) AS pct_microformat,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(classic_microformats_types) AS classic_microformats_type
JOIN
page_totals
USING (client)
GROUP BY
client,
classic_microformats_type,
total_pages
ORDER BY
freq_microformat DESC,
client
60 changes: 60 additions & 0 deletions sql/2024/structured-data/dublin_core_types.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# standardSQL
# dublin_core_types.sql
# Count Dublin Core types
CREATE TEMP FUNCTION getDublinCoreTypes(rendered STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
try {
rendered = JSON.parse(rendered);
return rendered.dublin_core.map(dublin_core => dublin_core.name.toLowerCase());
} catch (e) {
return [];
}
""";

WITH
rendered_data AS (
SELECT
client,
root_page AS url,
getDublinCoreTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS dublin_core_types
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
),

page_totals AS (
SELECT
client,
COUNT(DISTINCT root_page) AS total_pages
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
GROUP BY
client
)

SELECT
client,
dublin_core_type,
COUNT(dublin_core_type) AS count,
SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS freq_dublin_core,
COUNT(dublin_core_type) / SUM(COUNT(dublin_core_type)) OVER (PARTITION BY client) AS pct_dublin_core,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(dublin_core_types) AS dublin_core_type
JOIN
page_totals
USING (client)
GROUP BY
client,
dublin_core_type,
total_pages
ORDER BY
pct_dublin_core DESC,
client
60 changes: 60 additions & 0 deletions sql/2024/structured-data/facebook_types.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# standardSQL
# facebook_types.sql
# Count Facebook types
CREATE TEMP FUNCTION getFacebookTypes(rendered STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
try {
rendered = JSON.parse(rendered);
return rendered.facebook.map(facebook => facebook.property.toLowerCase());
} catch (e) {
return [];
}
""";

WITH
rendered_data AS (
SELECT
client,
root_page AS url,
getFacebookTypes(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS facebook_type
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
),

page_totals AS (
SELECT
client,
COUNT(DISTINCT root_page) AS total_pages
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
GROUP BY
client
)

SELECT
client,
facebook_type,
COUNT(facebook_type) AS freq_facebook,
SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS total_facebook,
COUNT(facebook_type) / SUM(COUNT(facebook_type)) OVER (PARTITION BY client) AS pct_facebook,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(facebook_type) AS facebook_type
JOIN
page_totals
USING (client)
GROUP BY
client,
facebook_type,
total_pages
ORDER BY
freq_facebook DESC,
client
79 changes: 79 additions & 0 deletions sql/2024/structured-data/jsonld_contexts.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# standardSQL
# jsonld_contexts.sql
# Count JSON-LD contexts
CREATE TEMP FUNCTION getJSONLDContexts(rendered STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
try {
const arrayify = (value) => Array.isArray(value) ? value : [value];
const getDeep = (key, o) => {
if (Array.isArray(o)) return o.map(child => getDeep(key, child)).flat();
if (o instanceof Object) {
return Object.entries(o).map(([k, value]) => {
if (k === key) return [...arrayify(value), ...getDeep(value)];
return getDeep(value);
}).flat();
}
return [];
}
rendered = JSON.parse(rendered);
const jsonld_scripts = rendered.jsonld_scripts;
return jsonld_scripts.map(jsonld_script => {
jsonld_script = JSON.parse(jsonld_script);
return getDeep('@context', jsonld_script);
}).flat().filter(context => typeof context === 'string');
} catch (e) {
return [];
}
""";

WITH
rendered_data AS (
SELECT
client,
root_page AS url,
getJSONLDContexts(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_context
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
),

page_totals AS (
SELECT
client,
COUNT(DISTINCT root_page) AS total_pages
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
GROUP BY
client
)

SELECT
client,
NET.REG_DOMAIN(jsonld_context) AS jsonld_context,
COUNT(0) AS freq_jsonld_context,
SUM(COUNT(0)) OVER (PARTITION BY client) AS total_jsonld_context,
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_jsonld_context,
COUNT(DISTINCT url) AS freq_pages,
total_pages,
COUNT(DISTINCT url) / total_pages AS pct_pages
FROM
rendered_data,
UNNEST(jsonld_context) AS jsonld_context
JOIN
page_totals
USING (client)
GROUP BY
client,
jsonld_context,
total_pages
ORDER BY
pct_jsonld_context DESC,
client
70 changes: 70 additions & 0 deletions sql/2024/structured-data/jsonld_depth_percentiles.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# standardSQL
# jsonld_depth_percentiles.sql
# Find the most nested entity in a JSON-LD document
CREATE TEMP FUNCTION getJSONLDEntitiesRelationships(rendered STRING)
RETURNS ARRAY<STRUCT<_from STRING, relationship STRING, _to STRING, depth NUMERIC>>
LANGUAGE js AS """
try {
const types = new Map();
const loadTypes = (o) => {
if (Array.isArray(o)) {
o.forEach(loadTypes);
} else if (o instanceof Object) {
if (o['@id'] && o['@type']) {
types.set(o['@id'], o['@type']);
}
Object.values(o).forEach(loadTypes);
}
}
const arrayify = (value) => Array.isArray(value) ? value : [value];
const getEntitiesAndRelationships = (o, _from, relationship, depth = 0) => {
if (Array.isArray(o)) return o.map(value => getEntitiesAndRelationships(value, _from, relationship, depth)).flat();
if (o instanceof Object) {
const type = types.get(o['@id']) || o['@type'];
return [{_from, relationship, _to: type, depth}, ...Object.entries(o).map(([k, value]) => getEntitiesAndRelationships(value, type, k, depth + 1))].flat();
}
return [];
}
rendered = JSON.parse(rendered);
const jsonld_scripts = rendered.jsonld_scripts.map(JSON.parse);
loadTypes(jsonld_scripts);
return jsonld_scripts.map(jsonld_script => getEntitiesAndRelationships(jsonld_script, undefined, undefined, 0)).flat();
} catch (e) {
return [];
}
""";

WITH rendered_data AS (
SELECT
client,
root_page AS url,
getJSONLDEntitiesRelationships(JSON_EXTRACT(JSON_VALUE(JSON_EXTRACT(payload, '$._structured-data')), '$.structured_data.rendered')) AS jsonld_entities_relationships
FROM
`httparchive.all.pages`
WHERE
date = '2024-06-01'
)

SELECT
client,
percentile,
APPROX_QUANTILES(jsonld_entity_relationship.depth, 1000)[OFFSET(percentile * 10)] AS depth,
ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url LIMIT 5), ' ') AS sample_urls
FROM
rendered_data,
UNNEST(jsonld_entities_relationships) AS jsonld_entity_relationship,
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile
GROUP BY
client,
percentile
ORDER BY
client,
percentile
Loading

0 comments on commit 518c197

Please sign in to comment.