diff --git a/package-lock.json b/package-lock.json
index 766fefefb..86423de95 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -62388,6 +62388,7 @@
         "chatbot-server-mongodb-public": "*",
         "csv": "^6.3.1",
         "dotenv": "^16.3.1",
+        "ingest-mongodb-public": "*",
         "mongodb-chatbot-server": "*",
         "mongodb-rag-core": "*",
         "yaml": "^2.3.4",
diff --git a/packages/ingest-mongodb-public/package.json b/packages/ingest-mongodb-public/package.json
index f9d9cb76b..843455a41 100644
--- a/packages/ingest-mongodb-public/package.json
+++ b/packages/ingest-mongodb-public/package.json
@@ -9,6 +9,9 @@
     "node": ">=18",
     "npm": ">=8"
   },
+  "module": "./build/index.js",
+  "main": "./build/index.js",
+  "types": "./build/index.d.ts",
   "scripts": {
     "preinstall": "npx playwright install chromium --with-deps",
     "clean": "rm -rf build",
diff --git a/packages/ingest-mongodb-public/src/index.ts b/packages/ingest-mongodb-public/src/index.ts
new file mode 100644
index 000000000..7ccab975d
--- /dev/null
+++ b/packages/ingest-mongodb-public/src/index.ts
@@ -0,0 +1 @@
+export * from "./sources";
diff --git a/packages/ingest-mongodb-public/src/modules.d.ts b/packages/ingest-mongodb-public/src/modules.d.ts
deleted file mode 100644
index e5ea7e8d6..000000000
--- a/packages/ingest-mongodb-public/src/modules.d.ts
+++ /dev/null
@@ -1 +0,0 @@
-declare module "turndown-plugin-gfm";
diff --git a/packages/ingest-mongodb-public/src/sources/index.ts b/packages/ingest-mongodb-public/src/sources/index.ts
index cfe4d5a06..4930a447a 100644
--- a/packages/ingest-mongodb-public/src/sources/index.ts
+++ b/packages/ingest-mongodb-public/src/sources/index.ts
@@ -27,9 +27,6 @@ import {
   MakeMongoDbUniversityDataSourceParams,
   makeMongoDbUniversityDataSource,
 } from "./mongodb-university";
-const { DEVCENTER_CONNECTION_URI, UNIVERSITY_DATA_API_KEY } = assertEnvVars(
-  PUBLIC_INGEST_ENV_VARS
-);
 import {
   getUrlsFromSitemap,
   initialWebSources,
@@ -38,6 +35,13 @@ import {
 } from "./mongodbDotCom";
 import { chromium } from "playwright";
 
+const { DEVCENTER_CONNECTION_URI, UNIVERSITY_DATA_API_KEY } = assertEnvVars(
+  PUBLIC_INGEST_ENV_VARS
+);
+
+export { snootyProjectConfig };
+export * from "./snooty/SnootyDataSource";
+
 /**
   Async constructor for specific data sources -- parameters baked in.
  */
diff --git a/packages/ingest-mongodb-public/src/sources/snootySources.ts b/packages/ingest-mongodb-public/src/sources/snootySources.ts
index 8ef857e01..07721cb96 100644
--- a/packages/ingest-mongodb-public/src/sources/snootySources.ts
+++ b/packages/ingest-mongodb-public/src/sources/snootySources.ts
@@ -103,8 +103,14 @@ export const snootyProjectConfig: LocallySpecifiedSnootyProjectConfig[] = [
   {
     type: "snooty",
     name: "mck",
-    tags: ["docs", "kubernetes", "k8s", "kubernetes-controllers", "kubernetes-operator"],
-    productName: "MongoDB Controllers for Kubernetes"
+    tags: [
+      "docs",
+      "kubernetes",
+      "k8s",
+      "kubernetes-controllers",
+      "kubernetes-operator",
+    ],
+    productName: "MongoDB Controllers for Kubernetes",
   },
   {
     type: "snooty",
diff --git a/packages/ingest-mongodb-public/src/turndown-plugin-gfm.d.ts b/packages/ingest-mongodb-public/src/turndown-plugin-gfm.d.ts
new file mode 100644
index 000000000..a222730f9
--- /dev/null
+++ b/packages/ingest-mongodb-public/src/turndown-plugin-gfm.d.ts
@@ -0,0 +1,4 @@
+declare module "turndown-plugin-gfm" {
+  import { Plugin as TurndownPlugin } from "turndown";
+  export function gfm(): TurndownPlugin;
+}
diff --git a/packages/ingest-mongodb-public/tsconfig.json b/packages/ingest-mongodb-public/tsconfig.json
index 596c9977c..b4f32bd83 100644
--- a/packages/ingest-mongodb-public/tsconfig.json
+++ b/packages/ingest-mongodb-public/tsconfig.json
@@ -3,5 +3,8 @@
   "compilerOptions": {
     "outDir": "./build"
   },
-  "include": ["./src/**/*.ts"]
+  "include": [
+    "./src/**/*.ts",
+    "./src/**/*.d.ts"
+  ]
 }
diff --git a/packages/scripts/package.json b/packages/scripts/package.json
index df9967154..37ad1909b 100644
--- a/packages/scripts/package.json
+++ b/packages/scripts/package.json
@@ -25,6 +25,7 @@
     "removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js",
     "getConversationText": "npm run build && node ./build/getConversationText.js",
     "findPageTitles": "npm run build && node ./build/main/findPageTitlesMain.js",
+    "findUningestedDocsSites": "npm run build && node ./build/findUningestedDocsSites.js",
     "listSlackMessages": "npm run build && node ./build/main/listSlackMessagesMain.js",
     "removeSlackMessage": "npm run build && node ./build/main/removeSlackMessageMain.js",
     "checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js",
@@ -48,6 +49,7 @@
     "mongodb-chatbot-server": "*",
     "mongodb-rag-core": "*",
     "chatbot-server-mongodb-public": "*",
+    "ingest-mongodb-public": "*",
     "yaml": "^2.3.4",
     "yargs": "^17.7.2"
   },
diff --git a/packages/scripts/src/findUningestedDocsSites.ts b/packages/scripts/src/findUningestedDocsSites.ts
new file mode 100644
index 000000000..72c781973
--- /dev/null
+++ b/packages/scripts/src/findUningestedDocsSites.ts
@@ -0,0 +1,111 @@
+/**
+  Lists Snooty docs projects that are available from the Snooty Data API but
+  are not configured for ingestion in `snootyProjectConfig`.
+
+  NOTE(review): importing from "ingest-mongodb-public" also evaluates its
+  `sources` module, which calls `assertEnvVars(PUBLIC_INGEST_ENV_VARS)` at load
+  time -- presumably the ingest env vars must be set to run this script.
+ */
+import { type SnootyProject, snootyProjectConfig } from "ingest-mongodb-public";
+
+/** Names of docs projects that are deprecated. */
+const deprecatedProjectNames = [
+  "atlas-app-services",
+  "atlas-open-service-broker",
+  "datalake",
+  "guides",
+  "realm",
+];
+
+/** Projects intentionally not ingested, each with a note saying why. */
+const omittedProjects = [
+  ...deprecatedProjectNames.map((name) => ({
+    name,
+    deprecated: true,
+    note: "Deprecated",
+  })),
+  {
+    name: "mongoid-railsmdb",
+    deprecated: false,
+    note: "Supposed to be a repo for a new docset but the project got deprioritized so all that's in there right now is a (potentially outdated) Getting Started guide",
+  },
+];
+
+/** Returns the `omittedProjects` entry named `projectName`, if there is one. */
+function getOmittedProject(projectName: string) {
+  return omittedProjects.find((p) => p.name === projectName);
+}
+
+/**
+  Fetches the list of docs projects from the Snooty Data API.
+
+  @throws if the request fails or the response body has no `data` field.
+ */
+async function listDocsProjectsFromApi() {
+  // The trailing slash matters: without it, `new URL("projects", base)`
+  // replaces the last path segment and requests `/projects`, not
+  // `/prod/projects`.
+  const apiBaseUrl = "https://snooty-data-api.mongodb.com/prod/";
+  const listProjectsUrl = new URL("projects", apiBaseUrl);
+  const response = await fetch(listProjectsUrl);
+  if (!response.ok) {
+    throw new Error(`Failed to list projects: ${response.statusText}`);
+  }
+  const responseBody = await response.json();
+  if (
+    typeof responseBody !== "object" ||
+    responseBody === null ||
+    !("data" in responseBody)
+  ) {
+    // `Error`'s second argument is an options bag, not extra message text, so
+    // the received body has to be embedded in the message itself.
+    throw new Error(
+      `Invalid response body. Received: ${JSON.stringify(responseBody)}`
+    );
+  }
+  const apiProjects = responseBody.data as SnootyProject[];
+  return apiProjects;
+}
+
+/**
+  Groups every API project that is missing from `snootyProjectConfig` into
+  deprecated, intentionally omitted, and still-ingestable buckets.
+ */
+async function findUningestedDocsSites() {
+  const apiProjects = await listDocsProjectsFromApi();
+  const ingestableProjectNames = new Set(apiProjects.map((p) => p.project));
+  const ingestedProjectNames = new Set(snootyProjectConfig.map((p) => p.name));
+  // A project should be ingested if it's ingestable but not ingested yet
+  const uningestedProjects = Array.from(ingestableProjectNames)
+    .filter((x) => !ingestedProjectNames.has(x))
+    .reduce(
+      (acc, projectName) => {
+        const omitted = getOmittedProject(projectName);
+        const deprecated = omitted?.deprecated;
+        if (deprecated) {
+          acc.deprecated.push(projectName);
+        } else if (omitted) {
+          acc.omitted.push(`${omitted.name} :: ${omitted.note}`);
+        } else {
+          acc.ingestable.push(projectName);
+        }
+        return acc;
+      },
+      { deprecated: [], omitted: [], ingestable: [] } as {
+        deprecated: string[];
+        omitted: string[];
+        ingestable: string[];
+      }
+    );
+  return uningestedProjects;
+}
+
+findUningestedDocsSites()
+  .then((projects) => {
+    console.log(projects);
+  })
+  .catch((error) => {
+    // Report the failure and exit non-zero so callers can detect it.
+    console.error(error);
+    process.exitCode = 1;
+  });
diff --git a/packages/scripts/src/turndown-plugin-gfm.d.ts b/packages/scripts/src/turndown-plugin-gfm.d.ts
new file mode 100644
index 000000000..a222730f9
--- /dev/null
+++ b/packages/scripts/src/turndown-plugin-gfm.d.ts
@@ -0,0 +1,4 @@
+declare module "turndown-plugin-gfm" {
+  import { Plugin as TurndownPlugin } from "turndown";
+  export function gfm(): TurndownPlugin;
+}
diff --git a/packages/scripts/tsconfig.json b/packages/scripts/tsconfig.json
index 596c9977c..78f0a78b6 100644
--- a/packages/scripts/tsconfig.json
+++ b/packages/scripts/tsconfig.json
@@ -1,7 +1,11 @@
 {
   "extends": "../../tsconfig.json",
   "compilerOptions": {
+    "lib": ["ESNext", "DOM"],
     "outDir": "./build"
   },
-  "include": ["./src/**/*.ts"]
+  "include": [
+    "./src/**/*.ts",
+    "./src/**/*.d.ts"
+  ]
 }