From 2b61ed0db9f23d820156a5cee6c989052fe24ff2 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 20 Nov 2024 17:08:00 +0100 Subject: [PATCH 1/3] cleanup --- definitions/extra/migration/backfill_pages.js | 402 ------------------ .../extra/migration/backfill_requests.js | 336 --------------- .../extra/migration/backfill_summary_pages.js | 221 ---------- .../migration/backfill_summary_requests.js | 277 ------------ .../extra/migration/reprocess_pages.js | 223 ---------- .../extra/migration/reprocess_requests.js | 99 ----- definitions/extra/test_env.js | 28 -- definitions/output/all/pages.js | 45 -- definitions/output/all/parsed_css.js | 16 - definitions/output/all/requests.js | 53 --- definitions/output/lighthouse.js | 20 - definitions/output/pages.js | 19 - definitions/output/requests.js | 21 - definitions/output/response_bodies.js | 20 - definitions/output/summary_pages.js | 108 ----- definitions/output/summary_requests.js | 80 ---- definitions/output/technologies.js | 24 -- workflow_settings.yaml | 2 +- 18 files changed, 1 insertion(+), 1993 deletions(-) delete mode 100644 definitions/extra/migration/backfill_pages.js delete mode 100644 definitions/extra/migration/backfill_requests.js delete mode 100644 definitions/extra/migration/backfill_summary_pages.js delete mode 100644 definitions/extra/migration/backfill_summary_requests.js delete mode 100644 definitions/extra/migration/reprocess_pages.js delete mode 100644 definitions/extra/migration/reprocess_requests.js delete mode 100644 definitions/extra/test_env.js delete mode 100644 definitions/output/all/pages.js delete mode 100644 definitions/output/all/parsed_css.js delete mode 100644 definitions/output/all/requests.js delete mode 100644 definitions/output/lighthouse.js delete mode 100644 definitions/output/pages.js delete mode 100644 definitions/output/requests.js delete mode 100644 definitions/output/response_bodies.js delete mode 100644 definitions/output/summary_pages.js delete mode 100644 definitions/output/summary_requests.js delete mode 100644 definitions/output/technologies.js diff --git a/definitions/extra/migration/backfill_pages.js b/definitions/extra/migration/backfill_pages.js deleted file mode 100644 index 542bb5ff..00000000 --- a/definitions/extra/migration/backfill_pages.js +++ /dev/null @@ -1,402 +0,0 @@ -const iterations = [] -const clients = constants.clients - -operate('backfill') - -let midMonth -for ( - let date = '2020-02-01'; - date >= '2020-02-01'; - date = constants.fnPastMonth(date) -) { - clients.forEach((client) => { - iterations.push({ - date, - client - }) - }) - - if (date <= '2018-12-01') { - midMonth = new Date(date) - midMonth.setDate(15) - midMonth = midMonth.toISOString().substring(0, 10) - - clients.forEach((client) => { - iterations.push({ - date: midMonth, - client - }) - }) - } -} - -function lighthouseReport (date, client) { - if (date >= '2017-06-01' && client === 'mobile') { - return { - join: ` -LEFT JOIN ( - SELECT - url, - SAFE.PARSE_JSON(report, wide_number_mode => 'round') AS report - FROM lighthouse.${constants.fnDateUnderscored(date)}_mobile ${constants.devTABLESAMPLE} -) AS lighthouse -ON pages.url = lighthouse.url;`, - column: 'lighthouse.report' - } - } - - return { - join: ';', - column: 'NULL' - } -} - -iterations.forEach((iteration, i) => { - operate(`backfill_pages ${iteration.date} ${iteration.client}`).tags([ - 'backfill_pages' - ]).queries(ctx => ` -DELETE FROM crawl.pages -WHERE date = '${iteration.date}' - AND client = '${iteration.client}'; - -CREATE TEMP FUNCTION parseDetectedApps( - detected JSON, - detected_apps JSON -) -RETURNS ARRAY, info ARRAY>> -LANGUAGE js AS ''' - // Initialize the result array to store the structs - const result = [] - - // Loop through each entry in detected_apps - for (const technology in detected_apps) { - const info = detected_apps[technology] ? [detected_apps[technology]] : [] - const categories = [] - - // Search for this technology in each category of detected - for (const category in detected) { - if (detected[category].includes(technology)) { - categories.push(category) - } - } - - // Add a struct with the technology, its categories, and info - result.push({ - technology: technology, - categories: categories, - info: info - }) - } - - return result -'''; - -CREATE TEMPORARY FUNCTION getOtherCustomMetrics( - payload JSON, - keys ARRAY -) -RETURNS JSON -LANGUAGE js AS r''' -try { - let otherMetrics = {} - let value = null - keys.forEach(function (key) { - try { - value = JSON.parse(payload[key].replace(/\\\\u[a-f0-9]{4}/g, '')) - } catch (e) { - value = payload[key] - } - otherMetrics[key.substr(1)] = value - }) - return otherMetrics -} catch (e) { - return null -} -'''; - -CREATE TEMP FUNCTION getFeatures(blinkFeatureFirstUsed JSON) -RETURNS ARRAY> -LANGUAGE js AS r''' -function getFeatureNames(featureMap, featureType) { - try { - return Object.entries(featureMap).map(([key, value]) => { - // After Feb 2020 keys are feature IDs. - if (value.name) { - return {'feature': value.name, 'type': featureType, 'id': key} - } - - // Prior to Feb 2020 keys fell back to IDs if the name was unknown. - if (idPattern.test(key)) { - return {'feature': '', 'type': featureType, 'id': key.match(idPattern)[1]} - } - - // Prior to Feb 2020 keys were names by default. - return {'feature': key, 'type': featureType, 'id': ''} - }) - } catch (e) { - return [] - } -} - -if (!blinkFeatureFirstUsed) return [] - -var idPattern = new RegExp('^Feature_(\\d+)$') -return getFeatureNames(blinkFeatureFirstUsed.Features, 'default') - .concat(getFeatureNames(blinkFeatureFirstUsed.CSSFeatures, 'css')) - .concat(getFeatureNames(blinkFeatureFirstUsed.AnimatedCSSFeatures, 'animated-css')) -'''; - -INSERT INTO crawl.pages -SELECT - DATE('${iteration.date}') AS date, - '${iteration.client}' AS client, - pages.url AS page, - TRUE AS is_root_page, - pages.url AS root_page, - COALESCE( - crux.rank, - CASE - WHEN summary_pages.rank = 0 THEN NULL - WHEN summary_pages.rank <= 1000 THEN 1000 - WHEN summary_pages.rank <= 5000 THEN 5000 - WHEN summary_pages.rank <= 10000 THEN 10000 - WHEN summary_pages.rank <= 50000 THEN 50000 - WHEN summary_pages.rank <= 100000 THEN 100000 - WHEN summary_pages.rank <= 500000 THEN 500000 - WHEN summary_pages.rank <= 1000000 THEN 1000000 - WHEN summary_pages.rank <= 5000000 THEN 5000000 - WHEN summary_pages.rank <= 10000000 THEN 10000000 - WHEN summary_pages.rank <= 50000000 THEN 50000000 - ELSE NULL - END - ) AS rank, - summary_pages.wptid, - JSON_REMOVE( - payload, - '$._metadata', - '$._detected', - '$._detected_apps', - '$._detected_technologies', - '$._detected_raw', - '$._custom', - '$._00_reset', - '$._a11y', - '$._ads', - '$._almanac', - '$._aurora', - '$._avg_dom_depth', - '$._cms', - '$._Colordepth', - '$._cookies', - '$._crawl_links', - '$._css-variables', - '$._css', - '$._doctype', - '$._document_height', - '$._document_width', - '$._Dpi', - '$._ecommerce', - '$._element_count', - '$._event-names', - '$._fugu-apis', - '$._generated-content', - '$._has_shadow_root', - '$._Images', - '$._img-loading-attr', - '$._initiators', - '$._inline_style_bytes', - '$._javascript', - '$._lib-detector-version', - '$._localstorage_size', - '$._markup', - '$._media', - '$._meta_viewport', - '$._num_iframes', - '$._num_scripts_async', - '$._num_scripts_sync', - '$._num_scripts', - '$._observers', - '$._origin-trials', - '$._parsed_css', - '$._performance', - '$._privacy-sandbox', - '$._privacy', - '$._pwa', - '$._quirks_mode', - '$._Resolution', - '$._responsive_images', - '$._robots_meta', - '$._robots_txt', - '$._sass', - '$._security', - '$._sessionstorage_size', - '$._structured-data', - '$._third-parties', - '$._usertiming', - '$._valid-head', - '$._well-known', - '$._wpt_bodies', - '$._blinkFeatureFirstUsed', - '$._CrUX' - ) AS payload, - TO_JSON( STRUCT( - SpeedIndex, - TTFB, - _connections, - bytesAudio, - bytesCSS, - bytesFlash, - bytesFont, - bytesGif, - bytesHtml, - bytesHtmlDoc, - bytesImg, - bytesJpg, - bytesJS, - bytesJson, - bytesOther, - bytesPng, - bytesSvg, - bytesText, - bytesTotal, - bytesVideo, - bytesWebp, - bytesXml, - cdn, - payload._CrUX AS crux, - fullyLoaded, - gzipSavings, - gzipTotal, - maxDomainReqs, - maxage0, - maxage1, - maxage30, - maxage365, - maxageMore, - maxageNull, - numCompressed, - numDomElements, - numDomains, - numErrors, - numGlibs, - numHttps, - numRedirects, - onContentLoaded, - onLoad, - renderStart, - reqAudio, - reqCSS, - reqFlash, - reqFont, - reqGif, - reqHtml, - reqImg, - reqJpg, - reqJS, - reqJson, - reqOther, - reqPng, - reqSvg, - reqText, - reqTotal, - reqVideo, - reqWebp, - reqXml, - visualComplete - )) AS summary, - STRUCT< - a11y JSON, - cms JSON, - cookies JSON, - css_variables JSON, - ecommerce JSON, - element_count JSON, - javascript JSON, - markup JSON, - media JSON, - origin_trials JSON, - performance JSON, - privacy JSON, - responsive_images JSON, - robots_txt JSON, - security JSON, - structured_data JSON, - third_parties JSON, - well_known JSON, - wpt_bodies JSON, - other JSON - >( - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._a11y"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._cms"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._cookies"), wide_number_mode => 'round'), - payload["_css-variables"], - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._ecommerce"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._element_count"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._javascript"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._markup"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._media"), wide_number_mode => 'round'), - payload["_origin-trials"], - payload._performance, - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._privacy"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._responsive_images"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._robots_txt"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._security"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._structured-data"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._third-parties"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._well-known"), wide_number_mode => 'round'), - SAFE.PARSE_JSON(JSON_VALUE(payload, "$._wpt_bodies"), wide_number_mode => 'round'), - TO_JSON(STRUCT( - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._Colordepth'), wide_number_mode => 'round') AS Colordepth, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._Dpi'), wide_number_mode => 'round') AS Dpi, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._Images'), wide_number_mode => 'round') AS Images, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._Resolution'), wide_number_mode => 'round') AS Resolution, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._almanac'), wide_number_mode => 'round') AS almanac, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._avg_dom_depth'), wide_number_mode => 'round') AS avg_dom_depth, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._css'), wide_number_mode => 'round') AS css, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._doctype'), wide_number_mode => 'round') AS doctype, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._document_height'), wide_number_mode => 'round') AS document_height, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._document_width'), wide_number_mode => 'round') AS document_width, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._event-names'), wide_number_mode => 'round') AS \`event-names\`, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._fugu-apis'), wide_number_mode => 'round') AS \`fugu-apis\`, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._has_shadow_root'), wide_number_mode => 'round') AS has_shadow_root, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._img-loading-attr'), wide_number_mode => 'round') AS \`img-loading-attr\`, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._initiators'), wide_number_mode => 'round') AS initiators, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._inline_style_bytes'), wide_number_mode => 'round') AS inline_style_bytes, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._lib-detector-version'), wide_number_mode => 'round') AS \`lib-detector-version\`, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._localstorage_size'), wide_number_mode => 'round') AS localstorage_size, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._meta_viewport'), wide_number_mode => 'round') AS meta_viewport, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._num_iframes'), wide_number_mode => 'round') AS num_iframes, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._num_scripts'), wide_number_mode => 'round') AS num_scripts, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._num_scripts_async'), wide_number_mode => 'round') AS num_scripts_async, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._num_scripts_sync'), wide_number_mode => 'round') AS num_scripts_sync, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._pwa'), wide_number_mode => 'round') AS pwa, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._quirks_mode'), wide_number_mode => 'round') AS quirks_mode, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._sass'), wide_number_mode => 'round') AS sass, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._sessionstorage_size'), wide_number_mode => 'round') AS sessionstorage_size, - SAFE.PARSE_JSON(JSON_VALUE(payload, '$._usertiming'), wide_number_mode => 'round') AS usertiming - )) - ) AS custom_metrics, - ${lighthouseReport(iteration.date, iteration.client).column} AS lighthouse, - getFeatures(payload._blinkFeatureFirstUsed) AS features, - parseDetectedApps(payload._detected, payload._detected_apps) AS technologies, - pages.payload._metadata AS metadata -FROM ( - SELECT - * EXCEPT(payload), - SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload - FROM \`pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} -) AS pages - -LEFT JOIN summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} AS summary_pages ${constants.devTABLESAMPLE} -ON pages.url = summary_pages.url - -LEFT JOIN ( - SELECT DISTINCT - CONCAT(origin, '/') AS page, - experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} -) AS crux -ON pages.url = crux.page -${lighthouseReport(iteration.date, iteration.client).join}`) -}) diff --git a/definitions/extra/migration/backfill_requests.js b/definitions/extra/migration/backfill_requests.js deleted file mode 100644 index 2b530a49..00000000 --- a/definitions/extra/migration/backfill_requests.js +++ /dev/null @@ -1,336 +0,0 @@ -const iterations = [] -const clients = constants.clients - -let midMonth -for ( - let date = '2020-10-01'; - date >= '2020-01-01'; - date = constants.fnPastMonth(date) -) { - clients.forEach((client) => { - if ( - !date - ) { return true } else { - iterations.push({ - date, - client - }) - } - }) - - if (date <= '2018-12-01') { - midMonth = new Date(date) - midMonth.setDate(15) - midMonth = midMonth.toISOString().substring(0, 10) - - clients.forEach((client) => { - if ( - !midMonth - ) { return true } else { - iterations.push({ - date: midMonth, - client - }) - } - }) - } -} - -iterations.forEach((iteration, i) => { - operate(`backfill_requests ${iteration.date} ${iteration.client}`).tags([ - 'backfill_requests' - ]).queries(ctx => ` -DELETE FROM crawl.requests -WHERE date = '${iteration.date}' - AND client = '${iteration.client}'; - -CREATE TEMP FUNCTION getExtFromURL(url STRING) -RETURNS STRING -LANGUAGE js AS ''' -try { - let ret_ext = url; - - // Remove query parameters - const i_q = ret_ext.indexOf('?'); - if (i_q > -1) { - ret_ext = ret_ext.substring(0, i_q) - } - - // Get the last segment of the path after the last '/' - ret_ext = ret_ext.substring(ret_ext.lastIndexOf('/') + 1) - - // Find the position of the last dot - const i_dot = ret_ext.lastIndexOf('.') - - if (i_dot === -1) { - // No dot means no extension - ret_ext = '' - } else { - // Extract the extension - ret_ext = ret_ext.substring(i_dot + 1) - - // Weed out overly long extensions - if (ret_ext.length > 5) { - ret_ext = '' - } - } - - return ret_ext.toLowerCase() -} catch (e) { - return '' // Return an empty string in case of any errors -} -'''; - -CREATE TEMP FUNCTION prettyType(mimeTyp STRING, ext STRING) -RETURNS STRING -LANGUAGE js AS ''' -try { - mimeTyp = mimeTyp.toLowerCase() - - // Order by most unique first. - // Do NOT do html because 'text/html' is often misused for other types. We catch it below. - const types = ['font', 'css', 'image', 'script', 'video', 'audio', 'xml']; - for (const typ of types) { - if (mimeTyp.includes(typ)) { - return typ - } - } - - // Special cases found manually - if (ext === 'js') { - return 'script' - } else if (mimeTyp.includes('json') || ext === 'json') { - return 'json' - } else if (['eot', 'ttf', 'woff', 'woff2', 'otf'].includes(ext)) { - return 'font' - } else if (['png', 'gif', 'jpg', 'jpeg', 'webp', 'ico', 'svg', 'avif', 'jxl', 'heic', 'heif'].includes(ext)) { - return 'image' - } else if (ext === 'css') { - return 'css' - } else if (ext === 'xml') { - return 'xml' - } else if ( - ['flash', 'webm', 'mp4', 'flv'].some((typ) => mimeTyp.includes(typ)) || - ['mp4', 'webm', 'ts', 'm4v', 'm4s', 'mov', 'ogv', 'swf', 'f4v', 'flv'].includes(ext) - ) { - return 'video' - } else if (mimeTyp.includes('wasm') || ext === 'wasm') { - return 'wasm' - } else if (mimeTyp.includes('html') || ['html', 'htm'].includes(ext)) { - return 'html' // Catch 'text/html' mime type - } else if (mimeTyp.includes('text')) { - return 'text' // Put 'text' LAST because it's often misused, so ext should take precedence - } else { - return 'other' - } -} catch (e) { - return 'other' -} -'''; - -CREATE TEMP FUNCTION getFormat(prettyTyp STRING, mimeTyp STRING, ext STRING) -RETURNS STRING -LANGUAGE js AS ''' -try { - if (prettyTyp === 'image') { - // Order by most popular first. - const imageTypes = ['jpg', 'png', 'gif', 'webp', 'svg', 'ico', 'avif', 'jxl', 'heic', 'heif']; - for (const typ of imageTypes) { - if (mimeTyp.includes(typ) || typ === ext) { - return typ - } - } - if (mimeTyp.includes('jpeg')) { - return 'jpg' - } - } - - if (prettyTyp === 'video') { - // Order by most popular first. - const videoTypes = ['flash', 'swf', 'mp4', 'flv', 'f4v'] - for (const typ of videoTypes) { - if (mimeTyp.includes(typ) || typ === ext) { - return typ - } - } - } - - return '' -} catch (e) { - return '' -} -'''; - -CREATE TEMP FUNCTION parseHeaders(headers JSON) -RETURNS ARRAY> -LANGUAGE js AS ''' - try { - return headers.map(header => { - return { name: header.name.toLowerCase(), value: header.value } - }) - } catch (e) { - return [] - } -'''; - -CREATE TEMP FUNCTION getCookieLen(headers JSON, cookieName STRING) -RETURNS INT64 -LANGUAGE js AS ''' - try { - const cookies = headers.filter(header => header.name.toLowerCase() === headerName) - if (!cookies) { - return 0 - } else if (Array.isArray(cookies)) { - const MAX_INT64 = 2 ** 63 - 600 - // Get the cookie length value - const cookieValue = cookies.values().reduce((acc, cookie) => acc + cookie.value.length, 0) - - return Math.min(cookieValue, MAX_INT64) - } else { - return 0 - } - } catch (e) { - return 0 // Return 0 in case of any errors - } -'''; - -CREATE TEMP FUNCTION getExpAge(startedDateTime STRING, responseHeaders JSON) -RETURNS INT64 -LANGUAGE js AS r''' - try { - const cacheControlRegExp = /max-age=(\\d+)/ - const MAX_INT64 = 2 ** 63 - 600 - - // Get the Cache-Control header value - const cacheControl = responseHeaders.find(header => header.name.toLowerCase() === 'cache-control')?.value - - // Handle no-cache scenarios - if (cacheControl && (cacheControl.includes('must-revalidate') || cacheControl.includes('no-cache') || cacheControl.includes('no-store'))) { - return 0 - } else if (cacheControl && cacheControlRegExp.test(cacheControl)) { - // Handle max-age directive in Cache-Control header - const maxAgeValue = parseInt(cacheControlRegExp.exec(cacheControl)[1]) - return Math.min(MAX_INT64, maxAgeValue) - } - - // Handle Expires header in the response - const expiresHeader = responseHeaders.find(header => header.name.toLowerCase() === 'expires')?.value - if (expiresHeader) { - const respDate = responseHeaders.find(header => header.name.toLowerCase() === 'date')?.value - const startDate = new Date(respDate)?.getTime() || Date.parse(startedDateTime) - const endDate = new Date(expiresHeader)?.getTime() || 0 - - // Calculate the difference in seconds, cap within INT64 range - const diffSeconds = Math.max((endDate - startDate) / 1000, 0) - return Math.min(MAX_INT64, diffSeconds) - } - - return 0 - } catch (e) { - return 0 // Return 0 in case of any errors - } -'''; - -INSERT INTO crawl.requests -SELECT - DATE('${iteration.date}') AS date, - '${iteration.client}' AS client, - requests.page AS page, - TRUE AS is_root_page, - requests.page AS root_page, - COALESCE( - crux.rank, - CASE - WHEN summary_pages.rank = 0 THEN NULL - WHEN summary_pages.rank <= 1000 THEN 1000 - WHEN summary_pages.rank <= 5000 THEN 5000 - WHEN summary_pages.rank <= 10000 THEN 10000 - WHEN summary_pages.rank <= 50000 THEN 50000 - WHEN summary_pages.rank <= 100000 THEN 100000 - WHEN summary_pages.rank <= 500000 THEN 500000 - WHEN summary_pages.rank <= 1000000 THEN 1000000 - WHEN summary_pages.rank <= 5000000 THEN 5000000 - WHEN summary_pages.rank <= 10000000 THEN 10000000 - WHEN summary_pages.rank <= 50000000 THEN 50000000 - ELSE NULL - END - ) AS rank, - requests.url AS url, - IF( - SAFE.STRING(payload._request_type) = 'Document' AND - MIN(index) OVER (PARTITION BY requests.page) = index, - TRUE, - FALSE - ) AS is_main_document, - type, - index, - payload, - TO_JSON( STRUCT( - payload.time, - payload._method AS method, - response.url AS redirectUrl, - IFNULL(SAFE.STRING(payload._protocol), SAFE.STRING(request.httpVersion)) AS reqHttpVersion, - request.headersSize AS reqHeadersSize, - request.bodySize AS reqBodySize, - getCookieLen(request.headers, 'cookie') AS reqCookieLen, - response.status, - response.httpVersion AS respHttpVersion, - response.headersSize AS respHeadersSize, - response.bodySize AS respBodySize, - response.content.size AS respSize, - getCookieLen(response.headers, 'set-cookie') AS respCookieLen, - getExpAge(SAFE.STRING(payload.startedDateTime), response.headers) AS expAge, - response.content.mimeType, - payload._cdn_provider, - payload._gzip_save, - ext, - getFormat(type, SAFE.STRING(response.content.mimeType), ext) AS format - )) AS summary, - parseHeaders(request.headers) AS request_headers, - parseHeaders(response.headers) AS response_headers, - IF(requests.type = 'image', NULL, response_bodies.response_body) AS response_body -FROM ( - FROM \`requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client}\` ${constants.devTABLESAMPLE} - |> SET payload = SAFE.PARSE_JSON(payload, wide_number_mode => 'round') - |> EXTEND getExtFromURL(url) AS ext - |> EXTEND prettyType(SAFE.STRING(payload.response.content.mimeType), ext) AS type - |> EXTEND SAFE.INT64(payload._index) AS index - |> EXTEND payload.request AS request - |> EXTEND payload.response AS response - |> SET payload = JSON_REMOVE( - payload, - '$._headers', - '$.request.headers', - '$.response.headers' - ) -) AS requests - -LEFT JOIN ( - SELECT DISTINCT - CONCAT(origin, '/') AS page, - experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fnPastMonth(iteration.date).substring(0, 7).replace('-', '')} -) AS crux -ON requests.page = crux.page - -LEFT JOIN ( - SELECT DISTINCT - url, - rank - FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} -) AS summary_pages -ON requests.page = summary_pages.url - -LEFT JOIN ( - SELECT - page, - url, - ANY_VALUE(body) AS response_body - FROM response_bodies.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} - GROUP BY page, url -) AS response_bodies ${constants.devTABLESAMPLE} -ON requests.page = response_bodies.page - AND requests.url = response_bodies.url; - `) -}) diff --git a/definitions/extra/migration/backfill_summary_pages.js b/definitions/extra/migration/backfill_summary_pages.js deleted file mode 100644 index adbb3c03..00000000 --- a/definitions/extra/migration/backfill_summary_pages.js +++ /dev/null @@ -1,221 +0,0 @@ -const iterations = [] -const clients = constants.clients - -let midMonth -for ( - let date = '2011-06-01'; - date >= '2011-06-01'; - date = constants.fnPastMonth(date) -) { - clients.forEach((client) => { - if ( - (date === '2013-12-01' && client === 'mobile') - ) { return true } else { - iterations.push({ - date, - client - }) - } - }) - - midMonth = new Date(date) - midMonth.setDate(15) - midMonth = midMonth.toISOString().substring(0, 10) - - clients.forEach((client) => { - if ( - (midMonth === '2014-06-15' && client === 'mobile') || - (midMonth === '2013-07-15') - ) { return true } else { - iterations.push({ - date: midMonth, - client - }) - } - }) -} - -function summaryObject (date) { - let list = '' - if (date >= '2010-11-15') { - list += `fullyLoaded, - bytesCSS, - bytesFlash, - bytesFont, - bytesGif, - bytesHtml, - bytesHtmlDoc, - bytesImg, - bytesJpg, - bytesJS, - bytesJson, - bytesOther, - bytesPng, - bytesTotal, - cdn, - gzipSavings, - gzipTotal, - maxage0, - maxage1, - maxage30, - maxage365, - maxageMore, - maxageNull, - maxDomainReqs, - numCompressed, - numDomains, - numDomElements, - numErrors, - numGlibs, - numHttps, - numRedirects, - onContentLoaded, - onLoad, - renderStart, - reqCSS, - reqFlash, - reqFont, - reqGif, - reqHtml, - reqImg, - reqJpg, - reqJS, - reqJson, - reqOther, - reqPng, - reqTotal, - SpeedIndex, - TTFB, - visualComplete` - } - if (date >= '2014-05-15') { - list += `, - _connections` - } - if (date >= '2015-05-01') { - list += `, - bytesAudio, - bytesSvg, - bytesText, - bytesVideo, - bytesWebp, - bytesXml, - reqAudio, - reqSvg, - reqText, - reqVideo, - reqWebp, - reqXml` - } - return list -} - -function customMetrics (date) { - let list = '' - if (date >= '2014-06-01' && date !== '2014-05-15') { - list += `avg_dom_depth, - doctype, - document_height, - document_width, - localstorage_size, - meta_viewport, - num_iframes, - num_scripts, - sessionstorage_size` - } - if (date >= '2015-11-01') { - list += `, - num_scripts_async, - num_scripts_sync` - } - return list -} - -iterations.forEach((iteration, i) => { - operate(`backfill_summary_pages ${iteration.date} ${iteration.client}`).tags([ - 'backfill_summary_pages' - ]).dependencies([ - i === 0 ? 'backfill' : `backfill_summary_pages ${iterations[i - 1].date} ${iterations[i - 1].client}` - ]).queries(ctx => ` -DELETE FROM crawl.pages -WHERE date = '${iteration.date}' - AND client = '${iteration.client}'; - -INSERT INTO crawl.pages -SELECT - DATE('${iteration.date}') AS date, - '${iteration.client}' AS client, - url AS page, - TRUE AS is_root_page, - url AS root_page, - CASE - WHEN rank = 0 THEN NULL - WHEN rank<=1000 THEN 1000 - WHEN rank<=5000 THEN 5000 - WHEN rank<=10000 THEN 10000 - WHEN rank<=50000 THEN 50000 - WHEN rank<=100000 THEN 100000 - WHEN rank<=500000 THEN 500000 - WHEN rank<=1000000 THEN 1000000 - WHEN rank<=5000000 THEN 5000000 - WHEN rank<=10000000 THEN 10000000 - WHEN rank<=50000000 THEN 50000000 - ELSE NULL - END AS rank, - wptid, - NULL AS payload, - TO_JSON( STRUCT( - ${summaryObject(iteration.date)} - )) AS summary, - STRUCT< - a11y JSON, - cms JSON, - cookies JSON, - css_variables JSON, - ecommerce JSON, - element_count JSON, - javascript JSON, - markup JSON, - media JSON, - origin_trials JSON, - performance JSON, - privacy JSON, - responsive_images JSON, - robots_txt JSON, - security JSON, - structured_data JSON, - third_parties JSON, - well_known JSON, - wpt_bodies JSON, - other JSON - >( - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - TO_JSON( STRUCT( - ${customMetrics(iteration.date)} - )) - ) AS custom_metrics, - NULL AS lighthouse, - NULL AS features, - NULL AS technologies, - NULL AS metadata -FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE}; - `) -}) diff --git a/definitions/extra/migration/backfill_summary_requests.js b/definitions/extra/migration/backfill_summary_requests.js deleted file mode 100644 index e5a5bf0d..00000000 --- a/definitions/extra/migration/backfill_summary_requests.js +++ /dev/null @@ -1,277 +0,0 @@ -const iterations = [] -const clients = constants.clients - -let midMonth -for ( - let date = '2011-06-01'; - date >= '2011-06-01'; - date = constants.fnPastMonth(date) -) { - clients.forEach((client) => { - if ( - (date === '2015-09-01' && client === 'mobile') || - (date === '2015-06-01' && client === 'mobile') || - (date === '2013-12-01') - ) { return true } else { - iterations.push({ - date, - client - }) - } - }) - - midMonth = new Date(date) - midMonth.setDate(15) - midMonth = midMonth.toISOString().substring(0, 10) - - clients.forEach((client) => { - if ( - (midMonth === '2014-06-15' && client === 'mobile') || - (midMonth === '2013-07-15') - ) { return true } else { - iterations.push({ - date: midMonth, - client - }) - } - }) -} - -function summaryObject (date) { - let list = '' - if (date >= '2010-11-15') { - list += ` - expAge, - method, - mimeType, - redirectUrl, - reqBodySize, - reqCookieLen, - reqHeadersSize, - respBodySize, - respCookieLen, - respHeadersSize, - respHttpVersion, - respSize, - status, - time` - } - if (date >= '2014-05-15') { - list += `, - _cdn_provider` - } - if (date >= '2014-06-01') { - list += `, - _gzip_save` - } - if (date >= '2015-05-01') { - list += `, - format` - } - return list -} - -iterations.forEach((iteration, i) => { - operate(`backfill_summary_requests ${iteration.date} ${iteration.client}`).tags([ - 'backfill_summary_requests' - ]).dependencies([ - i === 0 ? 'backfill' : `backfill_summary_requests ${iterations[i - 1].date} ${iterations[i - 1].client}` - ]).queries(ctx => ` -DELETE FROM crawl.requests -WHERE date = '${iteration.date}' AND client = '${iteration.client}'; - -CREATE TEMP FUNCTION get_ext_from_url(url STRING) -RETURNS STRING -LANGUAGE js -AS """ - try { - let ret_ext = url; - - // Remove query parameters - const i_q = ret_ext.indexOf("?"); - if (i_q > -1) { - ret_ext = ret_ext.substring(0, i_q); - } - - // Get the last segment of the path after the last "/" - ret_ext = ret_ext.substring(ret_ext.lastIndexOf("/") + 1); - - // Find the position of the last dot - const i_dot = ret_ext.lastIndexOf("."); - - if (i_dot === -1) { - // No dot means no extension - ret_ext = ""; - } else { - // Extract the extension - ret_ext = ret_ext.substring(i_dot + 1); - - // Weed out overly long extensions - if (ret_ext.length > 5) { - ret_ext = ""; - } - } - - return ret_ext.toLowerCase(); - } catch (e) { - return ""; // Return an empty string in case of any errors - } -"""; - -CREATE TEMP FUNCTION get_type(mime_typ STRING, ext STRING) -RETURNS STRING -LANGUAGE js -AS """ - try { - mime_typ = mime_typ.toLowerCase(); - - // Order by most unique types first - const uniqueTypes = ["font", "css", "image", "script", "video", "audio", "xml"]; - for (let typ of uniqueTypes) { - if (mime_typ.includes(typ)) { - return typ; - } - } - - // Special cases - if (mime_typ.includes("json") || ["js", "json"].includes(ext)) { - return "script"; - } else if (["eot", "ttf", "woff", "woff2", "otf"].includes(ext)) { - return "font"; - } else if ( - ["png", "gif", "jpg", "jpeg", "webp", "ico", "svg", "avif", "jxl", "heic", "heif"].includes(ext) - ) { - return "image"; - } else if (ext === "css") { - return "css"; - } else if (ext === "xml") { - return "xml"; - } else if ( - ["mp4", "webm", "ts", "m4v", "m4s", "mov", "ogv", "swf", "f4v", "flv"].includes(ext) || - ["flash", "webm", "mp4", "flv"].some(typ => mime_typ.includes(typ)) - ) { - return "video"; - } else if (mime_typ.includes("wasm") || ext === "wasm") { - return "wasm"; - } else if (mime_typ.includes("html") || ["html", "htm"].includes(ext)) { - return "html"; - } else if (mime_typ.includes("text")) { - // Put "text" last because it is often misused, so extension should take precedence. - return "text"; - } else { - return "other"; - } - } catch (e) { - return "other"; // Return "other" if there's any error - } -"""; - -CREATE TEMP FUNCTION parse_headers(headers STRING) -RETURNS ARRAY> -LANGUAGE js -AS r""" - if (!headers) return [] - - try { - const parsedHeaders = headers.split(/,\\s/).map(header => { - const [name, ...valueParts] = header.split(/\\s=\\s/); - if (name && valueParts.length > 0) { - return { name: name.trim(), value: valueParts.join('=').trim() }; - } - return null; - }); - - return parsedHeaders.filter(Boolean); - } catch (e) { - return []; - } -"""; - -INSERT INTO crawl.requests -SELECT - DATE('${iteration.date}') AS date, - '${iteration.client}' AS client, - pages.url AS page, - TRUE AS is_root_page, - pages.url AS root_page, - CASE - WHEN rank = 0 THEN NULL - WHEN rank<=1000 THEN 1000 - WHEN rank<=5000 THEN 5000 - WHEN rank<=10000 THEN 10000 - WHEN rank<=50000 THEN 50000 - WHEN rank<=100000 THEN 100000 - WHEN rank<=500000 THEN 500000 - WHEN rank<=1000000 THEN 1000000 - WHEN rank<=5000000 THEN 5000000 - WHEN rank<=10000000 THEN 10000000 - WHEN rank<=50000000 THEN 50000000 - ELSE NULL - END AS rank, - requests.url AS url, - requests.firstHTML AS is_main_document, - get_type(requests.mimeType, requests.ext_from_url) AS type, - IF(requests.firstReq, 1, NULL) AS index, - NULL AS payload, - TO_JSON( STRUCT( - ext_from_url AS ext, - ${summaryObject(iteration.date)} - )) AS summary, - ARRAY_CONCAT( - ARRAY>[ - ('Accept', requests.req_accept), - ("Accept-Charset", requests.req_accept_charset), - ("Accept-Encoding", requests.req_accept_encoding), - ("Accept-Language", requests.req_accept_language), - ("Connection", requests.req_connection), - ("Host", requests.req_host), - ("If-Modified-Since", requests.req_if_modified_since), - ("If-None-Match", requests.req_if_none_match), - ("Referer", requests.req_referer), - ("User-Agent", requests.req_user_agent) - ], - parse_headers(requests.reqOtherHeaders) - ) AS request_headers, - ARRAY_CONCAT( - ARRAY>[ - ("Accept-Ranges", requests.resp_accept_ranges), - ("Age", requests.resp_age), - ("Cache-Control", requests.resp_cache_control), - ("Connection", requests.resp_connection), - ("Content-Encoding", requests.resp_content_encoding), - ("Content-Language", requests.resp_content_language), - ("Content-Length", requests.resp_content_length), - ("Content-Location", requests.resp_content_location), - ("Content-Type", requests.resp_content_type), - ("Date", requests.resp_date), - ("ETag", requests.resp_etag), - ("Expires", requests.resp_expires), - ("Keep-Alive", requests.resp_keep_alive), - ("Last-Modified", requests.resp_last_modified), - ("Location", requests.resp_location), - ("Pragma", requests.resp_pragma), - ("Server", requests.resp_server), - ("Transfer-Encoding", requests.resp_transfer_encoding), - ("Vary", requests.resp_vary), - ("Via", requests.resp_via), - ("X-Powered-By", requests.resp_x_powered_by) - ], - parse_headers(requests.respOtherHeaders) - ) AS response_headers, - NULL AS response_body -FROM ( - SELECT - *, - get_ext_from_url(url) AS ext_from_url - FROM summary_requests.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} -) AS requests -LEFT JOIN ( - SELECT DISTINCT - url, - pageid, - rank - FROM summary_pages.${constants.fnDateUnderscored(iteration.date)}_${iteration.client} ${constants.devTABLESAMPLE} -) AS pages -ON requests.pageid = pages.pageid; - `) -}) diff --git a/definitions/extra/migration/reprocess_pages.js b/definitions/extra/migration/reprocess_pages.js deleted file mode 100644 index 91d8f144..00000000 --- a/definitions/extra/migration/reprocess_pages.js +++ /dev/null @@ -1,223 +0,0 @@ -operate('reprocess') - -const iterations = [] -const clients = constants.clients - -for ( - let month = '2022-03-01'; month >= '2022-03-01'; month = constants.fnPastMonth(month)) { - clients.forEach((client) => { - iterations.push({ - month, - client - }) - }) -} - -iterations.forEach((iteration, i) => { - operate(`reprocess_pages ${iteration.month} ${iteration.client}`).tags([ - 'reprocess_pages' - ]).dependencies([ - i === 0 ? 'reprocess' : `reprocess_pages ${iterations[i - 1].month} ${iterations[i - 1].client}` - ]).queries(ctx => ` -DELETE FROM crawl.pages -WHERE date = '${iteration.month}' AND - client = '${iteration.client}'; - -INSERT INTO crawl.pages -SELECT - date, - client, - page, - is_root_page, - root_page, - rank, - wptid, - JSON_REMOVE( - payload, - '$._metadata', - '$._detected', - '$._detected_apps', - '$._detected_technologies', - '$._detected_raw', - '$._custom', - '$._00_reset', - '$._a11y', - '$._ads', - '$._almanac', - '$._aurora', - '$._avg_dom_depth', - '$._cms', - '$._Colordepth', - '$._cookies', - '$._crawl_links', - '$._css-variables', - '$._css', - '$._doctype', - '$._document_height', - '$._document_width', - '$._Dpi', - '$._ecommerce', - '$._element_count', - '$._event-names', - '$._fugu-apis', - '$._generated-content', - '$._has_shadow_root', - '$._Images', - '$._img-loading-attr', - '$._initiators', - '$._inline_style_bytes', - '$._javascript', - '$._lib-detector-version', - '$._localstorage_size', - '$._markup', - '$._media', - '$._meta_viewport', - '$._num_iframes', - '$._num_scripts_async', - '$._num_scripts_sync', - '$._num_scripts', - '$._observers', - '$._origin-trials', - '$._parsed_css', - '$._performance', - '$._privacy-sandbox', - '$._privacy', - '$._pwa', - '$._quirks_mode', - '$._Resolution', - '$._responsive_images', - '$._robots_meta', - '$._robots_txt', - '$._sass', - '$._security', - '$._sessionstorage_size', - '$._structured-data', - '$._third-parties', - '$._usertiming', - '$._valid-head', - '$._well-known', - '$._wpt_bodies', - '$._blinkFeatureFirstUsed', - '$._CrUX' - ) AS payload, - JSON_SET( - JSON_REMOVE( - summary, - '$._adult_site', - '$.archive', - '$.avg_dom_depth', - '$.crawlid', - '$.createDate', - '$.doctype', - '$.document_height', - '$.document_width', - '$.label', - '$.localstorage_size', - '$.meta_viewport', - '$.metadata', - '$.num_iframes', - '$.num_scripts_async', - '$.num_scripts_sync', - '$.num_scripts', - '$.pageid', - '$.PageSpeed', - '$.rank', - '$.sessionstorage_size', - '$.startedDateTime', - '$.url', - '$.urlhash', - '$.urlShort', - '$.usertiming', - '$.wptid', - '$.wptrun' - ), - '$.crux', - payload._CrUX - ) AS summary, - STRUCT< - a11y JSON, - cms JSON, - cookies JSON, - css_variables JSON, - ecommerce JSON, - element_count JSON, - javascript JSON, - markup JSON, - media JSON, - origin_trials JSON, - performance JSON, - privacy JSON, - responsive_images JSON, - robots_txt JSON, - security JSON, - structured_data JSON, - third_parties JSON, - well_known JSON, - wpt_bodies JSON, - other JSON - >( - custom_metrics.a11y, - custom_metrics.cms, - custom_metrics.cookies, - custom_metrics["css-variables"], - custom_metrics.ecommerce, - custom_metrics.element_count, - custom_metrics.javascript, - custom_metrics.markup, - custom_metrics.media, - custom_metrics["origin-trials"], - custom_metrics.performance, - custom_metrics.privacy, - custom_metrics.responsive_images, - custom_metrics.robots_txt, - custom_metrics.security, - custom_metrics["structured-data"], - custom_metrics["third-parties"], - custom_metrics["well-known"], - custom_metrics.wpt_bodies, - JSON_REMOVE( - custom_metrics, - '$.a11y', - '$.cms', - '$.cookies', - '$.css-variables', - '$.ecommerce', - '$.element_count', - '$.javascript', - '$.markup', - '$.media', - '$.origin-trials', - '$.performance', - '$.privacy', - '$.responsive_images', - '$.robots_txt', - '$.security', - '$.structured-data', - '$.third-parties', - '$.well-known', - '$.wpt_bodies' - ) - ) AS custom_metrics, - lighthouse, - features, - technologies, - JSON_REMOVE( - metadata, - '$.page_id', - '$.parent_page_id', - '$.root_page_id' - ) AS metadata -FROM ( - SELECT - * EXCEPT (custom_metrics, lighthouse, metadata, payload, summary), - SAFE.PARSE_JSON(custom_metrics, wide_number_mode => 'round') AS custom_metrics, - SAFE.PARSE_JSON(lighthouse, wide_number_mode => 'round') AS lighthouse, - SAFE.PARSE_JSON(metadata, wide_number_mode => 'round') AS metadata, - SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, - SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary - FROM \`all.pages\` - WHERE date = '${iteration.month}' AND - client = '${iteration.client}' ${constants.devRankFilter} -); - `) -}) diff --git a/definitions/extra/migration/reprocess_requests.js b/definitions/extra/migration/reprocess_requests.js deleted file mode 100644 index 6e20494b..00000000 --- a/definitions/extra/migration/reprocess_requests.js +++ /dev/null @@ -1,99 +0,0 @@ -const iterations = [] - -for ( - let month = '2022-03-01'; month >= '2022-03-01'; month = constants.fnPastMonth(month)) { - constants.clients.forEach((client) => { - constants.booleans.forEach((isRootPage) => { - iterations.push({ - month, - client, - isRootPage - }) - }) - }) -} - -iterations.forEach((iteration, i) => { - operate(`reprocess_requests ${iteration.month} ${iteration.client} ${iteration.isRootPage}`).tags( - ['reprocess_requests'] - ).dependencies([ - i === 0 ? 'reprocess' : `reprocess_requests ${iterations[i - 1].month} ${iterations[i - 1].client} ${iterations[i - 1].isRootPage}` - ]).queries(ctx => ` -DELETE FROM crawl.requests -WHERE date = '${iteration.month}' - AND client = '${iteration.client}' - AND is_root_page = ${iteration.isRootPage}; - -CREATE TEMP FUNCTION pruneHeaders( - jsonObject JSON -) RETURNS JSON -LANGUAGE js AS ''' -try { - for (const [key, value] of Object.entries(jsonObject)) { - if(key.startsWith('req_') || key.startsWith('resp_')) { - delete jsonObject[key]; - } - } - return jsonObject; -} catch (e) { - return jsonObject; -} -'''; - -INSERT INTO crawl.requests -SELECT - date, - client, - requests.page, - is_root_page, - root_page, - crux.rank, - url, - is_main_document, - type, - index, - JSON_REMOVE( - payload, - '$._headers', - '$.request.headers', - '$.response.headers' - ) AS payload, - pruneHeaders( - JSON_REMOVE( - summary, - '$.crawlid', - '$.firstHtml', - '$.firstReq', - '$.pageid', - '$.reqOtherHeaders', - '$.requestid', - '$.respOtherHeaders', - '$.startedDateTime', - '$.type', - '$.url', - '$.urlShort' - ) - ) as summary, - request_headers, - response_headers, - response_body -FROM ( - SELECT - * EXCEPT (payload, summary), - SAFE.PARSE_JSON(payload, wide_number_mode => 'round') AS payload, - SAFE.PARSE_JSON(summary, wide_number_mode => 'round') AS summary - FROM \`all.requests\` ${constants.devTABLESAMPLE} - WHERE date = '${iteration.month}' - AND client = '${iteration.client}' - AND is_root_page = ${iteration.isRootPage} -) AS requests -LEFT JOIN ( - SELECT DISTINCT - CONCAT(origin, '/') AS page, - experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} - WHERE yyyymm = ${constants.fnPastMonth(iteration.month).substring(0, 7).replace('-', '')} -) AS crux -ON requests.root_page = crux.page; - `) -}) diff --git a/definitions/extra/test_env.js b/definitions/extra/test_env.js deleted file mode 100644 index c9c4b034..00000000 --- a/definitions/extra/test_env.js +++ /dev/null @@ -1,28 +0,0 @@ -const date = constants.currentMonth -operate('test') - -// List of resources to be copied to the test environment. Comment out the ones you don't need. -const resourcesList = [ - { datasetId: 'all', tableId: 'pages', filter: `date = '${date}'` }, - { datasetId: 'all', tableId: 'requests', filter: `date = '${date}'` }, - { datasetId: 'all', tableId: 'parsed_css', filter: `date = '${date}'` }, - { datasetId: 'core_web_vitals', tableId: 'technologies', filter: `date = '${date}'` }, - { datasetId: 'blink_features', tableId: 'usage', filter: `yyyymmdd = '${date}'` }, - { datasetId: 'blink_features', tableId: 'features', filter: `yyyymmdd = '${date}'` } -] - -// Copying the resources to the test environment. Using views instead of tables to avoid processing and speed things up. -// Prefixes and suffixes hardcoded in the query for the sake of safety. -resourcesList.forEach(resource => { - operate( - `test_table ${resource.datasetId}_dev_dev_${resource.tableId}` - ).dependencies(['test']).queries(` -CREATE SCHEMA IF NOT EXISTS ${resource.datasetId}_dev; -DROP TABLE IF EXISTS ${resource.datasetId}_dev.dev_${resource.tableId}; - -CREATE VIEW IF NOT EXISTS ${resource.datasetId}_dev.dev_${resource.tableId} AS -SELECT * -FROM \`${resource.datasetId}.${resource.tableId}\` ${constants.devTABLESAMPLE} -WHERE ${resource.filter} - `) -}) diff --git a/definitions/output/all/pages.js b/definitions/output/all/pages.js deleted file mode 100644 index e5b3490c..00000000 --- a/definitions/output/all/pages.js +++ /dev/null @@ -1,45 +0,0 @@ -publish('pages', { - type: 'incremental', - protected: true, - schema: 'all', - bigquery: { - partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'rank'], - requirePartitionFilter: true - }, - tags: ['crawl_results_legacy'] -}).preOps(ctx => ` -DELETE FROM ${ctx.self()} -WHERE date = '${constants.currentMonth}'; -`).query(ctx => ` -SELECT * -FROM ${ctx.ref('crawl_staging', 'pages')} -WHERE date = '${constants.currentMonth}' - AND client = 'desktop' - AND is_root_page = TRUE - ${constants.devRankFilter} -`).postOps(ctx => ` -INSERT INTO ${ctx.self()} -SELECT * -FROM ${ctx.ref('crawl_staging', 'pages')} -WHERE date = '${constants.currentMonth}' - AND client = 'desktop' - AND is_root_page = FALSE - ${constants.devRankFilter}; - -INSERT INTO ${ctx.self()} -SELECT * -FROM ${ctx.ref('crawl_staging', 'pages')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' - AND client = 'mobile' - AND is_root_page = TRUE - ${constants.devRankFilter}; - -INSERT INTO ${ctx.self()} -SELECT * -FROM ${ctx.ref('crawl_staging', 'pages')} -WHERE date = '${constants.currentMonth}' - AND client = 'mobile' - AND is_root_page = FALSE - ${constants.devRankFilter}; -`) diff --git a/definitions/output/all/parsed_css.js b/definitions/output/all/parsed_css.js deleted file mode 100644 index fcf9f480..00000000 --- a/definitions/output/all/parsed_css.js +++ /dev/null @@ -1,16 +0,0 @@ -publish('parsed_css', { - type: 'incremental', - protected: true, - schema: 'all', - bigquery: { - partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'rank', 'page'], - requirePartitionFilter: true - }, - tags: ['crawl_results_legacy'] -}).preOps(ctx => ` -DROP SNAPSHOT TABLE IF EXISTS ${ctx.self()}; - -CREATE SNAPSHOT TABLE ${ctx.self()} -CLONE ${ctx.ref('crawl', 'parsed_css')}; -`) diff --git a/definitions/output/all/requests.js b/definitions/output/all/requests.js deleted file mode 100644 index 9e50b0ed..00000000 --- a/definitions/output/all/requests.js +++ /dev/null @@ -1,53 +0,0 @@ -publish('requests', { - type: 'incremental', - protected: true, - schema: 'all', - bigquery: { - partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'is_main_document', 'type'], - requirePartitionFilter: true - }, - tags: ['crawl_results_legacy'] -}).preOps(ctx => ` -DELETE FROM ${ctx.self()} -WHERE date = '${constants.currentMonth}'; -`).query(ctx => ` -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'desktop' AND is_root_page = TRUE AND type = 'script' -`).postOps(ctx => ` -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'desktop' AND is_root_page = TRUE AND (type != 'script' OR type IS NULL); - -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'desktop' AND is_root_page = FALSE AND type = 'script'; - -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'desktop' AND is_root_page = FALSE AND (type != 'script' OR type IS NULL); - -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'mobile' AND is_root_page = TRUE AND type = 'script'; - -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'mobile' AND is_root_page = TRUE AND (type != 'script' OR type IS NULL); - -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'mobile' AND is_root_page = FALSE AND type = 'script'; - -INSERT INTO ${ctx.self()} -SELECT * EXCEPT (rank) -FROM ${ctx.ref('crawl_staging', 'requests')} ${constants.devTABLESAMPLE} -WHERE date = '${constants.currentMonth}' AND client = 'mobile' AND is_root_page = FALSE AND (type != 'script' OR type IS NULL) -`) diff --git a/definitions/output/lighthouse.js b/definitions/output/lighthouse.js deleted file mode 100644 index 210654b2..00000000 --- a/definitions/output/lighthouse.js +++ /dev/null @@ -1,20 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'lighthouse', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT - page AS url, - lighthouse AS report -FROM ${ctx.ref('all', 'pages')} -WHERE - date = '${constants.currentMonth}' - AND client = '${client}' - AND is_root_page - AND lighthouse IS NOT NULL - AND LENGTH(lighthouse) <= 2 * 1024 * 1024 -- legacy tables have a different limit - `) -}) diff --git a/definitions/output/pages.js b/definitions/output/pages.js deleted file mode 100644 index f9507069..00000000 --- a/definitions/output/pages.js +++ /dev/null @@ -1,19 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'pages', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT - page AS url, - payload -FROM ${ctx.ref('all', 'pages')} -WHERE date = '${constants.currentMonth}' AND - client = '${client}' AND - is_root_page AND - payload IS NOT NULL AND - LENGTH(payload) <= 2 * 1024 * 1024 -- legacy tables have a different limit - `) -}) diff --git a/definitions/output/requests.js b/definitions/output/requests.js deleted file mode 100644 index ca93ea2a..00000000 --- a/definitions/output/requests.js +++ /dev/null @@ -1,21 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'requests', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT - page, - url, - payload -FROM ${ctx.ref('all', 'requests')} -WHERE date = '${constants.currentMonth}' AND - client = '${client}' AND - is_root_page AND - payload IS NOT NULL AND - LENGTH(payload) <= 2 * 1024 * 1024 AND -- legacy tables have a different limit - SAFE.PARSE_JSON(payload) IS NOT NULL - `) -}) diff --git a/definitions/output/response_bodies.js b/definitions/output/response_bodies.js deleted file mode 100644 index f52b91d4..00000000 --- a/definitions/output/response_bodies.js +++ /dev/null @@ -1,20 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'response_bodies', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT - page, - url, - SUBSTRING(response_body, 0, 2 * 1024 * 1024) AS response_body, - LENGTH(response_body) >= 2 * 1024 * 1024 AS truncated -FROM ${ctx.ref('all', 'requests')} -WHERE date = '${constants.currentMonth}' AND - client = '${client}' AND - is_root_page AND - response_body IS NOT NULL - `) -}) diff --git a/definitions/output/summary_pages.js b/definitions/output/summary_pages.js deleted file mode 100644 index 63b3e995..00000000 --- a/definitions/output/summary_pages.js +++ /dev/null @@ -1,108 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'summary_pages', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT - SAFE_CAST(JSON_EXTRACT_SCALAR(METADATA, '$.page_id') AS INTEGER) AS pageid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.createDate') AS INTEGER) AS createDate, - JSON_EXTRACT_SCALAR(summary, '$.archive') AS archive, - JSON_EXTRACT_SCALAR(summary, '$.label') AS label, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.crawlid') AS INTEGER) AS crawlid, - JSON_EXTRACT_SCALAR(summary, '$.wptid') AS wptid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.wptrun') AS INTEGER) AS wptrun, - JSON_EXTRACT_SCALAR(summary, '$.url') AS url, - JSON_EXTRACT_SCALAR(summary, '$.urlShort') AS urlShort, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.urlhash') AS INTEGER) AS urlhash, - JSON_EXTRACT_SCALAR(summary, '$.cdn') AS cdn, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.startedDateTime') AS INTEGER) AS startedDateTime, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.TTFB') AS INTEGER) AS TTFB, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.renderStart') AS INTEGER) AS renderStart, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.onContentLoaded') AS INTEGER) AS onContentLoaded, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.onLoad') AS INTEGER) AS onLoad, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.fullyLoaded') AS INTEGER) AS fullyLoaded, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.visualComplete') AS INTEGER) AS visualComplete, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.PageSpeed') AS INTEGER) AS PageSpeed, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.SpeedIndex') AS INTEGER) AS SpeedIndex, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.rank') AS INTEGER) AS rank, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqTotal') AS INTEGER) AS reqTotal, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqHtml') AS INTEGER) AS reqHtml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqJS') AS INTEGER) AS reqJS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqCss') AS INTEGER) AS reqCSS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqImg') AS INTEGER) AS reqImg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqGif') AS INTEGER) AS reqGif, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqJpg') AS INTEGER) AS reqJpg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqPng') AS INTEGER) AS reqPng, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFont') AS INTEGER) AS reqFont, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqFlash') AS INTEGER) AS reqFlash, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqJson') AS INTEGER) AS reqJson, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqOther') AS INTEGER) AS reqOther, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesTotal') AS INTEGER) AS bytesTotal, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesHtml') AS INTEGER) AS bytesHtml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesJS') AS INTEGER) AS bytesJS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesCss') AS INTEGER) AS bytesCSS, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesImg') AS INTEGER) AS bytesImg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesGif') AS INTEGER) AS bytesGif, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesJpg') AS INTEGER) AS bytesJpg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesPng') AS INTEGER) AS bytesPng, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesFont') AS INTEGER) AS bytesFont, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesFlash') AS INTEGER) AS bytesFlash, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesJson') AS INTEGER) AS bytesJson, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesOther') AS INTEGER) AS bytesOther, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesHtmlDoc') AS INTEGER) AS bytesHtmlDoc, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numDomains') AS INTEGER) AS numDomains, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxDomainReqs') AS INTEGER) AS maxDomainReqs, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numRedirects') AS INTEGER) AS numRedirects, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numErrors') AS INTEGER) AS numErrors, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numGlibs') AS INTEGER) AS numGlibs, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numHttps') AS INTEGER) AS numHttps, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numCompressed') AS INTEGER) AS numCompressed, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.numDomElements') AS INTEGER) AS numDomElements, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxageNull') AS INTEGER) AS maxageNull, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage0') AS INTEGER) AS maxage0, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage1') AS INTEGER) AS maxage1, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage30') AS INTEGER) AS maxage30, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxage365') AS INTEGER) AS maxage365, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.maxageMore') AS INTEGER) AS maxageMore, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.gzipTotal') AS INTEGER) AS gzipTotal, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.gzipSavings') AS INTEGER) AS gzipSavings, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._connections') AS INTEGER) AS _connections, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._adult_site') AS BOOLEAN) AS _adult_site, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.avg_dom_depth') AS INTEGER) AS avg_dom_depth, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.document_height') AS INTEGER) AS document_height, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.document_width') AS INTEGER) AS document_width, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.localstorage_size') AS INTEGER) AS localstorage_size, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.sessionstorage_size') AS INTEGER) AS sessionstorage_size, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_iframes') AS INTEGER) AS num_iframes, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_scripts') AS INTEGER) AS num_scripts, - JSON_EXTRACT_SCALAR(summary, '$.doctype') AS doctype, - JSON_EXTRACT_SCALAR(summary, '$.meta_viewport') AS meta_viewport, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqAudio') AS INTEGER) AS reqAudio, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqVideo') AS INTEGER) AS reqVideo, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqText') AS INTEGER) AS reqText, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqXml') AS INTEGER) AS reqXml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqWebp') AS INTEGER) AS reqWebp, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqSvg') AS INTEGER) AS reqSvg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesAudio') AS INTEGER) AS bytesAudio, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesVideo') AS INTEGER) AS bytesVideo, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesText') AS INTEGER) AS bytesText, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesXml') AS INTEGER) AS bytesXml, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesWebp') AS INTEGER) AS bytesWebp, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.bytesSvg') AS INTEGER) AS bytesSvg, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_scripts_async') AS INTEGER) AS num_scripts_async, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.num_scripts_sync') AS INTEGER) AS num_scripts_sync, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.usertiming') AS INTEGER) AS usertiming, - metadata -FROM ${ctx.ref('all', 'pages')} -WHERE - date = '${constants.currentMonth}' AND - client = '${client}' AND - is_root_page AND - summary IS NOT NULL AND - JSON_EXTRACT_SCALAR(metadata, '$.page_id') IS NOT NULL AND - JSON_EXTRACT_SCALAR(metadata, '$.page_id') != '' - `) -}) diff --git a/definitions/output/summary_requests.js b/definitions/output/summary_requests.js deleted file mode 100644 index 3a758aaf..00000000 --- a/definitions/output/summary_requests.js +++ /dev/null @@ -1,80 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'summary_requests', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.requestid') AS INTEGER) AS requestid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.pageid') AS INTEGER) AS pageid, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.startedDateTime') AS INTEGER) AS startedDateTime, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.time') AS INTEGER) AS time, - JSON_EXTRACT_SCALAR(summary, '$.method') AS method, - JSON_EXTRACT_SCALAR(summary, '$.url') AS url, - JSON_EXTRACT_SCALAR(summary, '$.urlShort') AS urlShort, - JSON_EXTRACT_SCALAR(summary, '$.redirectUrl') AS redirectUrl, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.firstReq') AS BOOLEAN) AS firstReq, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.firstHtml') AS BOOLEAN) AS firstHtml, - JSON_EXTRACT_SCALAR(summary, '$.reqHttpVersion') AS reqHttpVersion, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqHeadersSize') AS INTEGER) AS reqHeadersSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqBodySize') AS INTEGER) AS reqBodySize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.reqCookieLen') AS INTEGER) AS reqCookieLen, - JSON_EXTRACT_SCALAR(summary, '$.reqOtherHeaders') AS reqOtherHeaders, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.status') AS INTEGER) AS status, - JSON_EXTRACT_SCALAR(summary, '$.respHttpVersion') AS respHttpVersion, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respHeadersSize') AS INTEGER) AS respHeadersSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respBodySize') AS INTEGER) AS respBodySize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respSize') AS INTEGER) AS respSize, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.respCookieLen') AS INTEGER) AS respCookieLen, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.expAge') AS INTEGER) AS expAge, - JSON_EXTRACT_SCALAR(summary, '$.mimeType') AS mimeType, - JSON_EXTRACT_SCALAR(summary, '$.respOtherHeaders') AS respOtherHeaders, - JSON_EXTRACT_SCALAR(summary, '$.req_accept') AS req_accept, - JSON_EXTRACT_SCALAR(summary, '$.req_accept_charset') AS req_accept_charset, - JSON_EXTRACT_SCALAR(summary, '$.req_accept_encoding') AS req_accept_encoding, - JSON_EXTRACT_SCALAR(summary, '$.req_accept_language') AS req_accept_language, - JSON_EXTRACT_SCALAR(summary, '$.req_connection') AS req_connection, - JSON_EXTRACT_SCALAR(summary, '$.req_host') AS req_host, - JSON_EXTRACT_SCALAR(summary, '$.req_if_modified_since') AS req_if_modified_since, - JSON_EXTRACT_SCALAR(summary, '$.req_if_none_match') AS req_if_none_match, - JSON_EXTRACT_SCALAR(summary, '$.req_referer') AS req_referer, - JSON_EXTRACT_SCALAR(summary, '$.req_user_agent') AS req_user_agent, - JSON_EXTRACT_SCALAR(summary, '$.resp_accept_ranges') AS resp_accept_ranges, - JSON_EXTRACT_SCALAR(summary, '$.resp_age') AS resp_age, - JSON_EXTRACT_SCALAR(summary, '$.resp_cache_control') AS resp_cache_control, - JSON_EXTRACT_SCALAR(summary, '$.resp_connection') AS resp_connection, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_encoding') AS resp_content_encoding, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_language') AS resp_content_language, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_length') AS resp_content_length, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_location') AS resp_content_location, - JSON_EXTRACT_SCALAR(summary, '$.resp_content_type') AS resp_content_type, - JSON_EXTRACT_SCALAR(summary, '$.resp_date') AS resp_date, - JSON_EXTRACT_SCALAR(summary, '$.resp_etag') AS resp_etag, - JSON_EXTRACT_SCALAR(summary, '$.resp_expires') AS resp_expires, - JSON_EXTRACT_SCALAR(summary, '$.resp_keep_alive') AS resp_keep_alive, - JSON_EXTRACT_SCALAR(summary, '$.resp_last_modified') AS resp_last_modified, - JSON_EXTRACT_SCALAR(summary, '$.resp_location') AS resp_location, - JSON_EXTRACT_SCALAR(summary, '$.resp_pragma') AS resp_pragma, - JSON_EXTRACT_SCALAR(summary, '$.resp_server') AS resp_server, - JSON_EXTRACT_SCALAR(summary, '$.resp_transfer_encoding') AS resp_transfer_encoding, - JSON_EXTRACT_SCALAR(summary, '$.resp_vary') AS resp_vary, - JSON_EXTRACT_SCALAR(summary, '$.resp_via') AS resp_via, - JSON_EXTRACT_SCALAR(summary, '$.resp_x_powered_by') AS resp_x_powered_by, - JSON_EXTRACT_SCALAR(summary, '$._cdn_provider') AS _cdn_provider, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$._gzip_save') AS INTEGER) AS _gzip_save, - SAFE_CAST(JSON_EXTRACT_SCALAR(summary, '$.crawlid') AS INTEGER) AS crawlid, - JSON_EXTRACT_SCALAR(summary, '$.type') AS type, - JSON_EXTRACT_SCALAR(summary, '$.ext') AS ext, - JSON_EXTRACT_SCALAR(summary, '$.format') AS format, -FROM ${ctx.ref('all', 'requests')} -WHERE - date = '${constants.currentMonth}' AND - client = '${client}' AND - is_root_page AND - summary IS NOT NULL AND - JSON_EXTRACT_SCALAR(summary, '$.requestid') IS NOT NULL AND - JSON_EXTRACT_SCALAR(summary, '$.requestid') != '' - `) -}) diff --git a/definitions/output/technologies.js b/definitions/output/technologies.js deleted file mode 100644 index 3bcb3533..00000000 --- a/definitions/output/technologies.js +++ /dev/null @@ -1,24 +0,0 @@ -const currentMonthUnderscored = constants.fnDateUnderscored(constants.currentMonth) - -constants.clients.forEach(client => { - publish(`${currentMonthUnderscored}_${client}`, { - type: 'table', - schema: 'technologies', - tags: ['crawl_results_legacy'] - }).query(ctx => ` -SELECT DISTINCT - page as url, - category, - tech.technology AS app, - info -FROM ${ctx.ref('all', 'pages')}, -UNNEST (technologies) AS tech, -UNNEST (tech.categories) AS category, -UNNEST (tech.info) AS info -WHERE date = '${constants.currentMonth}' AND - client = '${client}' AND - is_root_page - ${constants.devRankFilter} AND - tech.technology IS NOT NULL - `) -}) diff --git a/workflow_settings.yaml b/workflow_settings.yaml index a5476912..924cba4e 100644 --- a/workflow_settings.yaml +++ b/workflow_settings.yaml @@ -1,6 +1,6 @@ defaultProject: httparchive defaultLocation: US -defaultDataset: all +defaultDataset: crawl defaultAssertionDataset: dataform_assertions vars: From a33ce4bfb4ef6cf650d963e300e2a28139f10004 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 20 Nov 2024 17:09:27 +0100 Subject: [PATCH 2/3] remove tag trigger --- infra/dataform-trigger/index.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/infra/dataform-trigger/index.js b/infra/dataform-trigger/index.js index bf5c96a1..bd298545 100644 --- a/infra/dataform-trigger/index.js +++ b/infra/dataform-trigger/index.js @@ -41,8 +41,7 @@ FROM crux, report; repoName: 'crawl-data', tags: [ 'crawl_complete', - 'blink_features_report', - 'crawl_results_legacy' + 'blink_features_report' ] } } From 39d6ff064589c148840ad022e7ed7488de785a29 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Wed, 27 Nov 2024 22:05:11 +0100 Subject: [PATCH 3/3] dependency --- definitions/output/crawl/requests.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/definitions/output/crawl/requests.js b/definitions/output/crawl/requests.js index d53584db..1656dd63 100644 --- a/definitions/output/crawl/requests.js +++ b/definitions/output/crawl/requests.js @@ -108,7 +108,7 @@ LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + FROM ${ctx.ref('chrome-ux-report', 'experimental', 'global')} WHERE yyyymm = ${constants.fnPastMonth(constants.currentMonth).substring(0, 7).replace('-', '')} ) AS crux ON requests.root_page = crux.page @@ -168,7 +168,7 @@ LEFT JOIN ( SELECT DISTINCT CONCAT(origin, '/') AS page, experimental.popularity.rank AS rank - FROM ${ctx.resolve('chrome-ux-report', 'experimental', 'global')} + FROM ${ctx.ref('chrome-ux-report', 'experimental', 'global')} WHERE yyyymm = ${constants.fnPastMonth(constants.currentMonth).substring(0, 7).replace('-', '')} ) AS crux ON requests.root_page = crux.page;