Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Evals][Experiment] Expose an interim eval endpoint that accepts files. #1176

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions genkit-tools/common/src/eval/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import {
GenerateRequest,
GenerateRequestSchema,
GenerateResponseSchema,
InterimRunNewEvaluationRequest,
MessageData,
RunNewEvaluationRequest,
SpanData,
Expand Down Expand Up @@ -76,14 +77,15 @@ export async function runNewEvaluation(
(d) => d.datasetId === datasetId
);
const datasetVersion = targetDatasetMetadata?.version;
const actionConfig = request.options?.actionConfig;

logger.info('Running inference...');
const evalDataset = await runInference({
manager,
actionRef,
evalFlowInput: EvalInferenceInputSchema.parse({ samples: dataset }),
auth: request.options?.auth,
actionConfig: request.options?.actionConfig,
actionConfig,
});
const evaluatorActions = await getMatchingEvaluatorActions(
manager,
Expand All @@ -94,7 +96,41 @@ export async function runNewEvaluation(
manager,
evaluatorActions,
evalDataset,
augments: { actionRef, datasetId, datasetVersion },
augments: { actionRef, datasetId, datasetVersion, actionConfig },
});
return evalRun.key;
}

/**
 * Starts a new evaluation run. Intended to be used via the reflection API.
 */
export async function interimRunNewEvaluation(
  manager: RuntimeManager,
  request: InterimRunNewEvaluationRequest
): Promise<EvalRunKey> {
  const { input, actionRef, evaluators, options } = request;
  const actionConfig = options?.actionConfig;

  // Resolve evaluator actions up front, so an invalid evaluator list
  // fails before any inference work is started.
  const evaluatorActions = await getMatchingEvaluatorActions(
    manager,
    evaluators
  );

  // Run the target action over the provided input samples.
  const evalDataset = await runInference({
    manager,
    actionRef,
    evalFlowInput: input,
    auth: options?.auth,
    actionConfig,
  });

  // Score the inference results and persist the run; only the key is
  // returned to the caller.
  const evalRun = await runEvaluation({
    manager,
    evaluatorActions,
    evalDataset,
    augments: { actionRef, actionConfig },
  });
  return evalRun.key;
}
Expand Down
19 changes: 18 additions & 1 deletion genkit-tools/common/src/server/router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
*/
import { initTRPC, TRPCError } from '@trpc/server';
import { z } from 'zod';
import { getDatasetStore, getEvalStore, runNewEvaluation } from '../eval';
import {
getDatasetStore,
getEvalStore,
interimRunNewEvaluation,
runNewEvaluation,
} from '../eval';
import { RuntimeManager } from '../manager/manager';
import { GenkitToolsError } from '../manager/types';
import { Action } from '../types/action';
Expand Down Expand Up @@ -241,6 +246,18 @@ export const TOOLS_SERVER_ROUTER = (manager: RuntimeManager) =>
return response;
}),

/**
* Interim API to start new evaluation run.
*
* Will be deprecated in favor of `runNewEvaluation` once datasets are fully supported in the Dev UI */
interimRunNewEvaluation: loggedProcedure
.input(apis.InterimRunNewEvaluationRequestSchema)
.output(evals.EvalRunKeySchema)
.mutation(async ({ input }) => {
const response = await interimRunNewEvaluation(manager, input);
return response;
}),

/** Send a screen view analytics event */
sendPageView: t.procedure
.input(apis.PageViewSchema)
Expand Down
18 changes: 18 additions & 0 deletions genkit-tools/common/src/types/apis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,21 @@ export const RunNewEvaluationRequestSchema = z.object({
export type RunNewEvaluationRequest = z.infer<
typeof RunNewEvaluationRequestSchema
>;

/**
 * Request schema for the interim evaluation endpoint, which accepts
 * inference input inline instead of a stored dataset reference.
 *
 * Will be deprecated in favor of `RunNewEvaluationRequestSchema` once
 * datasets are fully supported in the Dev UI.
 */
export const InterimRunNewEvaluationRequestSchema = z.object({
  // Samples to run inference over.
  input: EvalInferenceInputSchema,
  // Reference to the action (e.g. a flow) to run inference with.
  actionRef: z.string(),
  // Evaluator action references to score the results with.
  evaluators: z.array(z.string()).optional(),
  options: z
    .object({
      // Auth payload forwarded to the action during inference.
      auth: z.string().optional(),
      actionConfig: z
        .any()
        .describe('additional parameters required for inference')
        .optional(),
    })
    .optional(),
});
export type InterimRunNewEvaluationRequest = z.infer<
  typeof InterimRunNewEvaluationRequestSchema
>;
5 changes: 5 additions & 0 deletions genkit-tools/common/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ export const EvalRunKeySchema = z.object({
actionRef: z.string().optional(),
datasetId: z.string().optional(),
datasetVersion: z.number().optional(),
actionConfig: z
.any()
.describe('addition parameters required for inference')
.optional(),
evalRunId: z.string(),
createdAt: z.string(),
});
Expand All @@ -116,6 +120,7 @@ export const EvalKeyAugmentsSchema = EvalRunKeySchema.pick({
datasetId: true,
datasetVersion: true,
actionRef: true,
actionConfig: true,
});
export type EvalKeyAugments = z.infer<typeof EvalKeyAugmentsSchema>;

Expand Down
2 changes: 1 addition & 1 deletion js/testapps/cat-eval/eval/cat_adoption_qna.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"cases": [
"samples": [
{
"input": "What are typical cat behaviors?",
"reference": "Cats like to purr, push things away and cuddle."
Expand Down
Loading