Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Evals][Experiment] Expose an interim eval endpoint that accepts files. #1176

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions genkit-tools/common/src/eval/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import {
GenerateRequest,
GenerateRequestSchema,
GenerateResponseSchema,
InterimRunNewEvaluationRequest,
MessageData,
RunNewEvaluationRequest,
SpanData,
Expand Down Expand Up @@ -76,14 +77,15 @@ export async function runNewEvaluation(
(d) => d.datasetId === datasetId
);
const datasetVersion = targetDatasetMetadata?.version;
const actionConfig = request.options?.actionConfig;

logger.info('Running inference...');
const evalDataset = await runInference({
manager,
actionRef,
evalFlowInput: EvalInferenceInputSchema.parse({ samples: dataset }),
auth: request.options?.auth,
actionConfig: request.options?.actionConfig,
actionConfig,
});
const evaluatorActions = await getMatchingEvaluatorActions(
manager,
Expand All @@ -94,7 +96,41 @@ export async function runNewEvaluation(
manager,
evaluatorActions,
evalDataset,
augments: { actionRef, datasetId, datasetVersion },
augments: { actionRef, datasetId, datasetVersion, actionConfig },
});
return evalRun.key;
}

/**
 * Starts a new evaluation run. Intended to be used via the reflection API.
 */
export async function interimRunNewEvaluation(
  manager: RuntimeManager,
  request: InterimRunNewEvaluationRequest
): Promise<EvalRunKey> {
  const { input, actionRef, evaluators, options } = request;
  const actionConfig = options?.actionConfig;

  // Resolve evaluator actions up front, so an invalid evaluator list
  // fails before any inference work is started.
  const evaluatorActions = await getMatchingEvaluatorActions(
    manager,
    evaluators
  );

  // Run the target action over the provided input samples.
  const evalDataset = await runInference({
    manager,
    actionRef,
    evalFlowInput: input,
    auth: options?.auth,
    actionConfig,
  });

  // Score the inference results and persist the run; only the key is
  // returned to the caller.
  const evalRun = await runEvaluation({
    manager,
    evaluatorActions,
    evalDataset,
    augments: { actionRef, actionConfig },
  });
  return evalRun.key;
}
Expand Down
19 changes: 18 additions & 1 deletion genkit-tools/common/src/server/router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
*/
import { initTRPC, TRPCError } from '@trpc/server';
import { z } from 'zod';
import { getDatasetStore, getEvalStore, runNewEvaluation } from '../eval';
import {
getDatasetStore,
getEvalStore,
interimRunNewEvaluation,
runNewEvaluation,
} from '../eval';
import { RuntimeManager } from '../manager/manager';
import { GenkitToolsError } from '../manager/types';
import { Action } from '../types/action';
Expand Down Expand Up @@ -241,6 +246,18 @@ export const TOOLS_SERVER_ROUTER = (manager: RuntimeManager) =>
return response;
}),

/**
* Interim API to start new evaluation run.
*
* Will be deprecated in favor of `runNewEvaluation` once datasets are fully supported in the Dev UI */
interimRunNewEvaluation: loggedProcedure
.input(apis.InterimRunNewEvaluationRequestSchema)
.output(evals.EvalRunKeySchema)
.mutation(async ({ input }) => {
const response = await interimRunNewEvaluation(manager, input);
return response;
}),

/** Send a screen view analytics event */
sendPageView: t.procedure
.input(apis.PageViewSchema)
Expand Down
18 changes: 18 additions & 0 deletions genkit-tools/common/src/types/apis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,21 @@ export const RunNewEvaluationRequestSchema = z.object({
export type RunNewEvaluationRequest = z.infer<
typeof RunNewEvaluationRequestSchema
>;

/**
 * Request schema for the interim evaluation endpoint, which accepts
 * inference input inline instead of a stored dataset reference.
 *
 * Will be deprecated in favor of `RunNewEvaluationRequestSchema` once
 * datasets are fully supported in the Dev UI.
 */
export const InterimRunNewEvaluationRequestSchema = z.object({
  // Samples to run inference over.
  input: EvalInferenceInputSchema,
  // Reference to the action (e.g. a flow) to run inference with.
  actionRef: z.string(),
  // Evaluator action references to score the results with.
  evaluators: z.array(z.string()).optional(),
  options: z
    .object({
      // Auth payload forwarded to the action during inference.
      auth: z.string().optional(),
      actionConfig: z
        .any()
        .describe('additional parameters required for inference')
        .optional(),
    })
    .optional(),
});
export type InterimRunNewEvaluationRequest = z.infer<
  typeof InterimRunNewEvaluationRequestSchema
>;
5 changes: 5 additions & 0 deletions genkit-tools/common/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ export const EvalRunKeySchema = z.object({
actionRef: z.string().optional(),
datasetId: z.string().optional(),
datasetVersion: z.number().optional(),
actionConfig: z
.any()
.describe('addition parameters required for inference')
.optional(),
evalRunId: z.string(),
createdAt: z.string(),
});
Expand All @@ -116,6 +120,7 @@ export const EvalKeyAugmentsSchema = EvalRunKeySchema.pick({
datasetId: true,
datasetVersion: true,
actionRef: true,
actionConfig: true,
});
export type EvalKeyAugments = z.infer<typeof EvalKeyAugmentsSchema>;

Expand Down
2 changes: 1 addition & 1 deletion js/testapps/cat-eval/eval/cat_adoption_qna.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"cases": [
"samples": [
{
"input": "What are typical cat behaviors?",
"reference": "Cats like to purr, push things away and cuddle."
Expand Down
Loading