
Commit 69d81aa

fix: Move away from JSON outputs to structured outputs. Fixes hoarder-app#1047
1 parent 379c49b

7 files changed: +74 -39 lines changed


apps/workers/openaiWorker.ts

+6 -3

@@ -162,7 +162,7 @@ async function inferTagsFromImage(
     ),
     metadata.contentType,
     base64,
-    { json: true, abortSignal },
+    { schema: openAIResponseSchema, abortSignal },
   );
 }

@@ -235,7 +235,10 @@ async function inferTagsFromPDF(
     `Content: ${bookmark.asset.content}`,
     serverConfig.inference.contextLength,
   );
-  return inferenceClient.inferFromText(prompt, { json: true, abortSignal });
+  return inferenceClient.inferFromText(prompt, {
+    schema: openAIResponseSchema,
+    abortSignal,
+  });
 }

 async function inferTagsFromText(
@@ -244,7 +247,7 @@ async function inferTagsFromText(
   abortSignal: AbortSignal,
 ) {
   return await inferenceClient.inferFromText(await buildPrompt(bookmark), {
-    json: true,
+    schema: openAIResponseSchema,
     abortSignal,
   });
 }
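
All three call sites now hand the inference client a zod schema instead of the bare `json: true` flag. `openAIResponseSchema` is defined elsewhere in the workers package and is not part of this diff; a minimal sketch of the shape the tagging callers rely on (an assumption, not taken from this commit) could look like:

import { z } from "zod";

// Hypothetical sketch only: the real openAIResponseSchema lives in the
// workers package and may carry more fields. The tagging flow expects a
// JSON object containing a list of tag strings.
const openAIResponseSchema = z.object({
  tags: z.array(z.string()),
});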

docs/docs/03-configuration.md

+14 -13

@@ -48,19 +48,20 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic tagging
 - You might want to tune the `INFERENCE_CONTEXT_LENGTH` as the default is quite small. The larger the value, the better the quality of the tags, but the more expensive the inference will be (money-wise on OpenAI and resource-wise on ollama).
 :::

 | Name                                 | Required | Default                | Description |
 | ------------------------------------ | -------- | ---------------------- | ----------- |
 | OPENAI_API_KEY                       | No       | Not set                | The OpenAI key used for automatic tagging. More on that in [here](/openai). |
 | OPENAI_BASE_URL                      | No       | Not set                | If you just want to use OpenAI you don't need to pass this variable. If, however, you want to use some other openai compatible API (e.g. azure openai service), set this to the url of the API. |
 | OLLAMA_BASE_URL                      | No       | Not set                | If you want to use ollama for local inference, set the address of ollama API here. |
 | OLLAMA_KEEP_ALIVE                    | No       | Not set                | Controls how long the model will stay loaded into memory following the request (example value: "5m"). |
 | INFERENCE_TEXT_MODEL                 | No       | gpt-4o-mini            | The model to use for text inference. You'll need to change this to some other model if you're using ollama. |
 | INFERENCE_IMAGE_MODEL                | No       | gpt-4o-mini            | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
 | EMBEDDING_TEXT_MODEL                 | No       | text-embedding-3-small | The model to be used for generating embeddings for the text. |
 | INFERENCE_CONTEXT_LENGTH             | No       | 2048                   | The max number of tokens that we'll pass to the inference model. If your content is larger than this size, it'll be truncated to fit. The larger this value, the more of the content will be used in tag inference, but the more expensive the inference will be (money-wise on openAI and resource-wise on ollama). Check the model you're using for its max supported content size. |
 | INFERENCE_LANG                       | No       | english                | The language in which the tags will be generated. |
 | INFERENCE_JOB_TIMEOUT_SEC            | No       | 30                     | How long to wait for the inference job to finish before timing out. If you're running ollama without powerful GPUs, you might want to increase the timeout a bit. |
 | INFERENCE_FETCH_TIMEOUT_SEC          | No       | 300                    | \[Ollama Only\] The timeout of the fetch request to the ollama server. If your inference requests take longer than the default 5mins, you might want to increase this timeout. |
+| INFERENCE_SUPPORTS_STRUCTURED_OUTPUT | No       | true                   | Whether the inference model supports structured output or not. |

 :::info
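
Not every OpenAI-compatible endpoint or local ollama model accepts a JSON-schema response format, which is what the new flag is for. A hypothetical `.env` excerpt (values are illustrative, not taken from the docs) that turns structured output off for such a model:

# Illustrative values only.
OLLAMA_BASE_URL=http://localhost:11434
INFERENCE_TEXT_MODEL=llama3.1
INFERENCE_SUPPORTS_STRUCTURED_OUTPUT=false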

packages/shared/config.ts

+2

@@ -27,6 +27,7 @@ const allEnv = z.object({
   INFERENCE_IMAGE_MODEL: z.string().default("gpt-4o-mini"),
   EMBEDDING_TEXT_MODEL: z.string().default("text-embedding-3-small"),
   INFERENCE_CONTEXT_LENGTH: z.coerce.number().default(2048),
+  INFERENCE_SUPPORTS_STRUCTURED_OUTPUT: stringBool("true"),
   OCR_CACHE_DIR: z.string().optional(),
   OCR_LANGS: z
     .string()
@@ -94,6 +95,7 @@ const serverConfigSchema = allEnv.transform((val) => {
       imageModel: val.INFERENCE_IMAGE_MODEL,
       inferredTagLang: val.INFERENCE_LANG,
       contextLength: val.INFERENCE_CONTEXT_LENGTH,
+      supportsStructuredOutput: val.INFERENCE_SUPPORTS_STRUCTURED_OUTPUT,
     },
     embedding: {
       textModel: val.EMBEDDING_TEXT_MODEL,
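
`stringBool` is an existing helper in config.ts that this diff does not show, used for boolean-ish environment variables with a string default. Assuming it follows the usual zod pattern in this file, it behaves roughly like this sketch:

import { z } from "zod";

// Assumed shape of the existing helper: accept "true"/"false" strings,
// fall back to the given default, and expose a real boolean to the config.
function stringBool(defaultValue: string) {
  return z
    .string()
    .default(defaultValue)
    .refine((s) => s === "true" || s === "false", "must be true or false")
    .transform((s) => s === "true");
}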

packages/shared/inference.ts

+21 -9

@@ -1,5 +1,8 @@
 import { Ollama } from "ollama";
 import OpenAI from "openai";
+import { zodResponseFormat } from "openai/helpers/zod";
+import { z } from "zod";
+import { zodToJsonSchema } from "zod-to-json-schema";

 import serverConfig from "./config";
 import { customFetch } from "./customFetch";
@@ -15,12 +18,13 @@ export interface EmbeddingResponse {
 }

 export interface InferenceOptions {
-  json: boolean;
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  schema: z.ZodSchema<any> | null;
   abortSignal?: AbortSignal;
 }

 const defaultInferenceOptions: InferenceOptions = {
-  json: true,
+  schema: null,
 };

 export interface InferenceClient {
@@ -72,9 +76,11 @@ class OpenAIInferenceClient implements InferenceClient {
       {
         messages: [{ role: "user", content: prompt }],
         model: serverConfig.inference.textModel,
-        response_format: optsWithDefaults.json
-          ? { type: "json_object" }
-          : undefined,
+        response_format:
+          optsWithDefaults.schema &&
+          serverConfig.inference.supportsStructuredOutput
+            ? zodResponseFormat(optsWithDefaults.schema, "schema")
+            : undefined,
       },
       {
         signal: optsWithDefaults.abortSignal,
@@ -101,9 +107,11 @@ class OpenAIInferenceClient implements InferenceClient {
     const chatCompletion = await this.openAI.chat.completions.create(
       {
         model: serverConfig.inference.imageModel,
-        response_format: optsWithDefaults.json
-          ? { type: "json_object" }
-          : undefined,
+        response_format:
+          optsWithDefaults.schema &&
+          serverConfig.inference.supportsStructuredOutput
+            ? zodResponseFormat(optsWithDefaults.schema, "schema")
+            : undefined,
         messages: [
           {
             role: "user",
@@ -178,7 +186,11 @@ class OllamaInferenceClient implements InferenceClient {
     }
     const chatCompletion = await this.ollama.chat({
       model: model,
-      format: optsWithDefaults.json ? "json" : undefined,
+      format:
+        optsWithDefaults.schema &&
+        serverConfig.inference.supportsStructuredOutput
+          ? zodToJsonSchema(optsWithDefaults.schema)
+          : undefined,
       stream: true,
       keep_alive: serverConfig.inference.ollamaKeepAlive,
       options: {
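
The two providers receive the schema in different encodings: the OpenAI client gets a `json_schema` response format built by `zodResponseFormat`, while ollama gets a plain JSON Schema produced by `zodToJsonSchema` in its `format` field. A small standalone sketch of what those helpers return (the schema contents are assumed; library versions as in package.json below):

import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
import { zodToJsonSchema } from "zod-to-json-schema";

// An example schema standing in for openAIResponseSchema.
const tagsSchema = z.object({ tags: z.array(z.string()) });

// OpenAI path: a response_format of type "json_schema" (strict structured output).
const openAIFormat = zodResponseFormat(tagsSchema, "schema");
console.log(openAIFormat.type); // "json_schema"

// Ollama path: a plain JSON Schema object passed as the chat request's `format`.
const ollamaFormat = zodToJsonSchema(tagsSchema);
console.log(JSON.stringify(ollamaFormat, null, 2));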

packages/shared/package.json

+4 -3

@@ -8,11 +8,12 @@
     "glob": "^11.0.0",
     "liteque": "^0.3.2",
     "meilisearch": "^0.37.0",
-    "ollama": "^0.5.9",
-    "openai": "^4.67.1",
+    "ollama": "^0.5.14",
+    "openai": "^4.86.1",
     "typescript-parsec": "^0.3.4",
     "winston": "^3.11.0",
-    "zod": "^3.22.4"
+    "zod": "^3.22.4",
+    "zod-to-json-schema": "^3.24.3"
   },
   "devDependencies": {
     "@hoarder/eslint-config": "workspace:^0.2.0",

packages/trpc/routers/bookmarks.ts

+1 -1

@@ -1109,7 +1109,7 @@ Content: ${bookmark.content ?? ""}
   );

   const summary = await inferenceClient.inferFromText(summaryPrompt, {
-    json: false,
+    schema: null,
   });

   if (!summary.response) {
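
The summary path deliberately keeps `schema: null`: with no schema, the client sets neither `response_format` nor `format`, and the model answers in free text. Contrasting the two call styles after this commit (a sketch; the prompt variable names are illustrative, the option shapes come from the diffs above):

// Tagging: constrain the reply to the zod schema (structured output).
await inferenceClient.inferFromText(tagPrompt, {
  schema: openAIResponseSchema,
  abortSignal,
});

// Summarization: no schema, so the reply is plain prose.
await inferenceClient.inferFromText(summaryPrompt, { schema: null });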

pnpm-lock.yaml

+26 -10 (generated lockfile; diff not rendered)
