Merge pull request #6 from cloudflare/mhart/update-retrieval-plugin-instructions

Clarify example-retrieval-plugin instructions
cloudflare · May 14, 2023 · 9281c27 · 9281c27
2 parents 461e723 + 27152b4
commit 9281c27
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 19 deletions.
diff --git a/example-retrieval-plugin/README.md b/example-retrieval-plugin/README.md
@@ -14,7 +14,7 @@ so that relevant text can be retrieved quickly when a query comes through. These
 ## Overview
 
 You'll need an [OpenAI API Key](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety) to run this.
-The (optional) scheduler Worker also needs a [token to access GitHub's GraphQL API](https://docs.github.com/en/graphql/guides/forming-calls-with-graphql#authenticating-with-a-personal-access-token-classic),
+The (optional) scheduler Worker also needs a [token to access GitHub's GraphQL API](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic),
 and [Queues enabled on the Cloudflare account](https://developers.cloudflare.com/queues/).
 
 The plugin Worker responds to the `/query` endpoint – as well as returning the JSON manifests needed for the plugin.
@@ -52,23 +52,31 @@ npm run dev
 ### Deploying
 
 ```
+cd plugin
+
 npx wrangler kv:namespace create retrieval
 ```
 
 Then update the KV `id` (under `[[kv_namespaces]]`) in `plugin/wrangler.toml` – leave the `binding` name as-is
 
+Then you'll need to enter your `OPENAI_API_KEY` (you'll be prompted to paste it)
+
 ```
 npx wrangler secret put OPENAI_API_KEY
 ```
 
-Finally:
+And finally:
 
 ```
-npm run deploy
+npx wrangler publish
 ```
 
+(or `npm run deploy` from the same directory as this README)
+
 ## Refreshing the embeddings manually
 
+Before you can query anything, you'll need to populate the embeddings vector store, which you can do manually with:
+
 ```
 OPENAI_API_KEY="sk-..." KV_NAMESPACE_ID="d03..." npm run refresh
 ```
@@ -102,23 +110,28 @@ npm run dev:scheduler
 
 ### Deploying
 
-```
-npx wrangler queues create embeddings-resolver
-```
+Update the KV `id` (under `[[kv_namespaces]]`) in `scheduler/wrangler.toml`
+to match the one used by the Plugin Worker – leave the `binding` name as-is
 
-```
-npx wrangler kv:namespace create retrieval
-```
+You'll need an [OpenAI API Key](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety), a
+[token to access GitHub's GraphQL API](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic)
+(it needs a "classic" token, but you won't need any specific privileges if you're just reading from a public repo –
+you can leave them all unchecked), and [Queues enabled on the Cloudflare account](https://developers.cloudflare.com/queues/).
 
-Then update the KV `id` (under `[[kv_namespaces]]`) in `scheduler/wrangler.toml` – leave the `binding` name as-is
+Then you can create the resources needed for the scheduler worker:
 
 ```
+cd scheduler
+
+npx wrangler queues create embeddings-resolver
 npx wrangler secret put OPENAI_API_KEY
 npx wrangler secret put GITHUB_API_KEY
 ```
 
-Finally:
+And finally:
 
 ```
-npm run deploy:scheduler
+npx wrangler publish
 ```
+
+(or `npm run deploy:scheduler` from the same directory as this README)
diff --git a/example-retrieval-plugin/scheduler/wrangler.toml b/example-retrieval-plugin/scheduler/wrangler.toml
@@ -2,9 +2,10 @@ name = "scheduled-retrieval"
 main = "src/index.js"
 compatibility_date = "2023-04-07"
 usage_model = "unbound"
+workers_dev = false
 
 [triggers]
-crons = [ "1 * * * *" ]
+crons = [ "1 * * * *" ] # Every hour at 1 minute past
 
 [[kv_namespaces]]
 binding = "KV"

diff --git a/example-retrieval-plugin/scripts/chunk.js b/example-retrieval-plugin/scripts/chunk.js
@@ -35,8 +35,8 @@ const chunks = filesWithContents
   .map(({ path, oid, text }) => textToChunks(path, oid, text))
   .flat();
 
-console.error("Number of files: ", filesWithContents.length);
-console.error("Number of chunks: ", chunks.length);
+console.warn("Number of files: ", filesWithContents.length);
+console.warn("Number of chunks: ", chunks.length);
 
 writeFileSync(
   "chunks.bulk.json",

diff --git a/example-retrieval-plugin/scripts/embeddings.js b/example-retrieval-plugin/scripts/embeddings.js
@@ -25,8 +25,8 @@ async function main() {
 
   const vectorCollection = VectorCollection.from(chunkEmbeddings);
 
-  console.error("Number of embeddings:", vectorCollection.length);
-  console.error("Embedding length:", vectorCollection.embeddingLength);
+  console.warn("Number of embeddings:", vectorCollection.length);
+  console.warn("Embedding length:", vectorCollection.embeddingLength);
 
   writeFileSync("embeddings.bin", new Uint8Array(vectorCollection.buffer));
 }
diff --git a/example-retrieval-plugin/shared/docs.js b/example-retrieval-plugin/shared/docs.js
@@ -37,7 +37,7 @@ export function textToChunks(filePath, fileId, markdownText) {
   let title = (markdownText.match(/^title: (.+)$/m) ?? [])[1];
 
   if (title == null) {
-    console.error("No title:", filePath);
+    console.warn("No title:", filePath);
     return [];
   }
 
@@ -52,7 +52,7 @@ export function textToChunks(filePath, fileId, markdownText) {
     .trim();
 
   if (markdownText.length < 100) {
-    console.error("Too short:", filePath);
+    console.warn("Too short:", filePath);
     return [];
   }