From 5192269931093379ed81609323231e3cec4f7d04 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 7 Feb 2025 09:16:06 +0100 Subject: [PATCH 1/4] update docs for litellm changes --- .../guides/evaluation/builtin_scorers.mdx | 199 ++++++++---------- 1 file changed, 86 insertions(+), 113 deletions(-) diff --git a/docs/docs/guides/evaluation/builtin_scorers.mdx b/docs/docs/guides/evaluation/builtin_scorers.mdx index 422ea91ff9b1..cc8c4958b6f6 100644 --- a/docs/docs/guides/evaluation/builtin_scorers.mdx +++ b/docs/docs/guides/evaluation/builtin_scorers.mdx @@ -15,7 +15,9 @@ import TabItem from '@theme/TabItem'; **LLM-evaluators** - The pre-defined scorers that use LLMs support the OpenAI, Anthropic, Google GenerativeAI and MistralAI clients. They also use `weave`'s `InstructorLLMScorer` class, so you'll need to install the [`instructor`](https://github.com/instructor-ai/instructor) Python package to be able to use them. You can get all necessary dependencies with `pip install "weave[scorers]"` + The pre-defined scorers that leverage LLMs now automatically integrate with litellm. + You no longer need to pass an LLM client; just set the `model_id`. + See the supported models [here](https://docs.litellm.ai/docs/providers). ## `HallucinationFreeScorer` @@ -24,12 +26,7 @@ import TabItem from '@theme/TabItem'; ```python from weave.scorers import HallucinationFreeScorer - llm_client = ... # initialize your LLM client here - - scorer = HallucinationFreeScorer( - client=llm_client, - model_id="gpt-4o" - ) + scorer = HallucinationFreeScorer() ``` **Customization:** @@ -44,15 +41,12 @@ import TabItem from '@theme/TabItem'; ```python import asyncio - from openai import OpenAI import weave from weave.scorers import HallucinationFreeScorer - # Initialize clients and scorers - llm_client = OpenAI() + # Initialize scorer with a column mapping if needed. hallucination_scorer = HallucinationFreeScorer( - client=llm_client, - model_id="gpt-4o", + model_id="openai/gpt-4o", # or any other model supported by litellm column_map={"context": "input", "output": "other_col"} ) @@ -73,7 +67,8 @@ import TabItem from '@theme/TabItem'; ) result = asyncio.run(evaluation.evaluate(model)) print(result) - # {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 1.4395725727081299}} + # Example output: + # {'HallucinationFreeScorer': {'has_hallucination': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': ...}} ``` --- @@ -85,11 +80,8 @@ import TabItem from '@theme/TabItem'; ```python from weave.scorers import SummarizationScorer - llm_client = ... # initialize your LLM client here - scorer = SummarizationScorer( - client=llm_client, - model_id="gpt-4o" + model_id="openai/gpt-4o" # or any other model supported by litellm ) ``` @@ -97,24 +89,22 @@ import TabItem from '@theme/TabItem'; This scorer evaluates summaries in two ways: - 1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. Similar to how entity density is used in the Chain of Density paper, https://arxiv.org/abs/2309.04269 - - 2. **Quality Grading:** Uses an LLM-evaluator to grade the summary as `poor`, `ok`, or `excellent`. These grades are converted to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) so you can calculate averages. + 1. 
**Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary to estimate its information density. An LLM extracts the entities. + 2. **Quality Grading:** An LLM evaluator grades the summary as `poor`, `ok`, or `excellent`. These grades are then mapped to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) for aggregate performance evaluation. **Customization:** - - Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to define what makes a good summary. + - Adjust `summarization_evaluation_system_prompt` and `summarization_evaluation_prompt` to tailor the evaluation process. **Notes:** - - This scorer uses the `InstructorLLMScorer` class. - - The `score` method expects the original text that was summarized to be present in the `input` column of the dataset. Use the `column_map` class attribute to map `input` to the correct dataset column if needed. + - The scorer uses litellm internally. + - The `score` method expects the original text (the one being summarized) to be present in the `input` column. Use `column_map` if your dataset uses a different name. - Here you have an example usage of the `SummarizationScorer` in the context of an evaluation: + Here you have an example usage in the context of an evaluation: ```python import asyncio - from openai import OpenAI import weave from weave.scorers import SummarizationScorer @@ -123,24 +113,21 @@ import TabItem from '@theme/TabItem'; async def predict(self, input: str) -> str: return "This is a summary of the input text." - # Initialize clients and scorers - llm_client = OpenAI() - model = SummarizationModel() + # Initialize scorer summarization_scorer = SummarizationScorer( - client=llm_client, - model_id="gpt-4o", + model_id="openai/gpt-4o" # or any other model supported by litellm ) # Create dataset dataset = [ {"input": "The quick brown fox jumps over the lazy dog."}, {"input": "Artificial Intelligence is revolutionizing various industries."} ] - # Run evaluation evaluation = weave.Evaluation(dataset=dataset, scorers=[summarization_scorer]) - results = asyncio.run(evaluation.evaluate(model)) + results = asyncio.run(evaluation.evaluate(SummarizationModel())) print(results) - # {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': 6.210803985595703e-05}} + # Example output: + # {'SummarizationScorer': {'is_entity_dense': {'true_count': 0, 'true_fraction': 0.0}, 'summarization_eval_score': {'mean': 0.0}, 'entity_density': {'mean': 0.0}}, 'model_latency': {'mean': ...}} ``` --- @@ -151,30 +138,25 @@ import TabItem from '@theme/TabItem'; ```python from weave.scorers import OpenAIModerationScorer - from openai import OpenAI - - oai_client = OpenAI() # initialize your LLM client here scorer = OpenAIModerationScorer( - client=oai_client, - model_id="text-embedding-3-small" + model_id="openai/text-embedding-3-small" # or any other model supported by litellm ) ``` **How It Works:** - - Sends the AI's output to the OpenAI Moderation endpoint and returns a dictionary indicating whether the content is flagged and details about the categories involved. + - Sends the AI's output to the OpenAI Moderation endpoint and returns a structured response indicating if the content is flagged. **Notes:** - Requires the `openai` Python package. 
- - The client must be an instance of OpenAI's `OpenAI` or `AsyncOpenAI` client. + - The LLM client is now managed internally. - Here you have an example in the context of an evaluation: + Here is an example in the context of an evaluation: ```python import asyncio - from openai import OpenAI import weave from weave.scorers import OpenAIModerationScorer @@ -183,10 +165,10 @@ import TabItem from '@theme/TabItem'; async def predict(self, input: str) -> str: return input - # Initialize clients and scorers - client = OpenAI() - model = MyModel() - moderation_scorer = OpenAIModerationScorer(client=client) + # Initialize scorer + moderation_scorer = OpenAIModerationScorer( + model_id="openai/text-embedding-3-small" # or any other model supported by litellm + ) # Create dataset dataset = [ @@ -196,55 +178,46 @@ import TabItem from '@theme/TabItem'; # Run evaluation evaluation = weave.Evaluation(dataset=dataset, scorers=[moderation_scorer]) - results = asyncio.run(evaluation.evaluate(model)) + results = asyncio.run(evaluation.evaluate(MyModel())) print(results) - # {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': 9.500980377197266e-05}} + # Example output: + # {'OpenAIModerationScorer': {'flagged': {'true_count': 1, 'true_fraction': 0.5}, 'categories': {'violence': {'true_count': 1, 'true_fraction': 1.0}}}, 'model_latency': {'mean': ...}} ``` --- ## `EmbeddingSimilarityScorer` - The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It's useful for measuring how similar the AI's output is to a reference text. + The `EmbeddingSimilarityScorer` computes the cosine similarity between the embeddings of the AI system's output and a target text from your dataset. It is useful for measuring how similar the AI's output is to a reference text. ```python from weave.scorers import EmbeddingSimilarityScorer - llm_client = ... # initialise your LlM client - similarity_scorer = EmbeddingSimilarityScorer( - client=llm_client + model_id="openai/text-embedding-3-small", # or any other model supported by litellm target_column="reference_text", # the dataset column to compare the output against - threshold=0.4 # the cosine similarity threshold to use + threshold=0.4 # the cosine similarity threshold ) ``` **Parameters:** - - `target`: This scorer expects a `target` column in your dataset, it will calculate the cosine similarity of the embeddings of the `target` column to the AI system output. If your dataset doesn't contain a column called `target` you can use the scorers `column_map` attribute to map `target` to the appropriate column name in your dataset. See the Column Mapping section for more. - - `threshold` (float): The minimum cosine similarity score between the embedding of the AI system output and the embdedding of the `target`, above which the 2 samples are considered "similar", (defaults to `0.5`). `threshold` can be in a range from -1 to 1: - - 1 indicates identical direction. - - 0 indicates orthogonal vectors. - - -1 indicates opposite direction. - - The correct cosine similarity threshold to set can fluctuate quite a lot depending on your use case, we advise exploring different thresholds. + - `target`: This scorer expects a `target` column in your dataset. It calculates the cosine similarity of the embeddings of the `target` text to the AI system output. 
Use the `column_map` attribute if the column has a different name. + - `threshold` (float): The minimum cosine similarity score (between -1 and 1) needed to consider the two texts similar (defaults to `0.5`). - Here you have an example usage of the `EmbeddingSimilarityScorer` in the context of an evaluation: + Here is an example usage in the context of an evaluation: ```python import asyncio - from openai import OpenAI import weave from weave.scorers import EmbeddingSimilarityScorer - # Initialize clients and scorers - client = OpenAI() + # Initialize scorer similarity_scorer = EmbeddingSimilarityScorer( - client=client, - threshold=0.7, - column_map={"target": "reference"} + model_id="openai/text-embedding-3-small", # or any other model supported by litellm + target_column="reference_text", + threshold=0.7 ) - # Create dataset dataset = [ { @@ -256,7 +229,6 @@ import TabItem from '@theme/TabItem'; "reference": "Pepe likes various types of cheese.", }, ] - # Define model @weave.op def model(input: str) -> str: @@ -269,14 +241,15 @@ import TabItem from '@theme/TabItem'; ) result = asyncio.run(evaluation.evaluate(model)) print(result) - # {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.8448514031462045}}, 'model_latency': {'mean': 0.45862746238708496}} + # Example output: + # {'EmbeddingSimilarityScorer': {'is_similar': {'true_count': 1, 'true_fraction': 0.5}, 'similarity_score': {'mean': 0.844851403}}, 'model_latency': {'mean': ...}} ``` --- ## `ValidJSONScorer` - The ValidJSONScorer checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity. + The `ValidJSONScorer` checks whether the AI system's output is valid JSON. This scorer is useful when you expect the output to be in JSON format and need to verify its validity. ```python from weave.scorers import ValidJSONScorer @@ -284,7 +257,7 @@ import TabItem from '@theme/TabItem'; json_scorer = ValidJSONScorer() ``` - Here you have an example usage of the `ValidJSONScorer` in the context of an evaluation: + Here is an example in the context of an evaluation: ```python import asyncio @@ -309,14 +282,15 @@ import TabItem from '@theme/TabItem'; evaluation = weave.Evaluation(dataset=dataset, scorers=[json_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) - # {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.58306884765625e-05}} + # Example output: + # {'ValidJSONScorer': {'json_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': ...}} ``` --- ## `ValidXMLScorer` - The `ValidXMLScorer` checks whether the AI system's output is valid XML. This is useful when expecting XML-formatted outputs. + The `ValidXMLScorer` checks whether the AI system's output is valid XML. It is useful when expecting XML-formatted outputs. 
```python from weave.scorers import ValidXMLScorer @@ -324,7 +298,7 @@ import TabItem from '@theme/TabItem'; xml_scorer = ValidXMLScorer() ``` - Here you have an example usage of the `ValidXMLScorer` in the context of an evaluation: + Here is an example in the context of an evaluation: ```python import asyncio @@ -348,7 +322,8 @@ import TabItem from '@theme/TabItem'; evaluation = weave.Evaluation(dataset=dataset, scorers=[xml_scorer]) results = asyncio.run(evaluation.evaluate(model)) print(results) - # {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': 8.20159912109375e-05}} + # Example output: + # {'ValidXMLScorer': {'xml_valid': {'true_count': 2, 'true_fraction': 1.0}}, 'model_latency': {'mean': ...}} ``` --- @@ -372,44 +347,45 @@ import TabItem from '@theme/TabItem'; ## RAGAS - `ContextEntityRecallScorer` - The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library + The `ContextEntityRecallScorer` estimates context recall by extracting entities from both the AI system's output and the provided context, then computing the recall score. It is based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library. ```python from weave.scorers import ContextEntityRecallScorer - llm_client = ... # initialise your LlM client - entity_recall_scorer = ContextEntityRecallScorer( - client=llm_client - model_id="your-model-id" + model_id="openai/gpt-4o" ) ``` **How It Works:** - Uses an LLM to extract unique entities from the output and context and calculates recall. - - **Recall** indicates the proportion of important entities from the context that are captured in the output, helping to assess the model's effectiveness in retrieving relevant information. + - **Recall** indicates the proportion of important entities from the context that are captured in the output. - Returns a dictionary with the recall score. **Notes:** - - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. + - Expects a `context` column in your dataset. Use the `column_map` attribute if the column name is different. --- ## RAGAS - `ContextRelevancyScorer` - The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It helps determine if the context used is appropriate for generating the output. Based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library. + The `ContextRelevancyScorer` evaluates the relevancy of the provided context to the AI system's output. It is based on the [RAGAS](https://github.com/explodinggradients/ragas) evaluation library. ```python from weave.scorers import ContextRelevancyScorer - llm_client = ... # initialise your LlM client - relevancy_scorer = ContextRelevancyScorer( - llm_client = ... # initialise your LlM client - model_id="your-model-id" - ) + model_id="openai/gpt-4o", # or any other model supported by litellm + relevancy_prompt=""" + Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. 
+ + Question: {question} + Context: {context} + Relevancy Score (0-1): + """ + ) ``` **How It Works:** @@ -419,15 +395,14 @@ import TabItem from '@theme/TabItem'; **Notes:** - - Expects a `context` column in your dataset, use `column_map` to map `context` to another dataset column if needed. + - Expects a `context` column in your dataset. Use `column_map` if a different name is used. - Customize the `relevancy_prompt` to define how relevancy is assessed. - Here you have an example usage of `ContextEntityRecallScorer` and `ContextRelevancyScorer` in the context of an evaluation: + Here is an example usage in the context of an evaluation: ```python import asyncio from textwrap import dedent - from openai import OpenAI import weave from weave.scorers import ContextEntityRecallScorer, ContextRelevancyScorer @@ -437,9 +412,6 @@ import TabItem from '@theme/TabItem'; "Retrieve relevant context" return "Paris is the capital of France." - - model = RAGModel() - # Define prompts relevancy_prompt: str = dedent(""" Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. @@ -448,20 +420,9 @@ import TabItem from '@theme/TabItem'; Context: {context} Relevancy Score (0-1): """) - - # Initialize clients and scorers - llm_client = OpenAI() - entity_recall_scorer = ContextEntityRecallScorer( - client=client, - model_id="gpt-4o", - ) - - relevancy_scorer = ContextRelevancyScorer( - client=llm_client, - model_id="gpt-4o", - relevancy_prompt=relevancy_prompt - ) - + # Initialize scorers + entity_recall_scorer = ContextEntityRecallScorer() + relevancy_scorer = ContextRelevancyScorer(relevancy_prompt=relevancy_prompt) # Create dataset dataset = [ { @@ -473,17 +434,18 @@ import TabItem from '@theme/TabItem'; "context": "William Shakespeare wrote many famous plays." } ] - # Run evaluation evaluation = weave.Evaluation( dataset=dataset, scorers=[entity_recall_scorer, relevancy_scorer] ) - results = asyncio.run(evaluation.evaluate(model)) + results = asyncio.run(evaluation.evaluate(RAGModel())) print(results) - # {'ContextEntityRecallScorer': {'recall': {'mean': 0.3333333333333333}}, 'ContextRelevancyScorer': {'relevancy_score': {'mean': 0.5}}, 'model_latency': {'mean': 9.393692016601562e-05}} + # Example output: + # {'ContextEntityRecallScorer': {'recall': {'mean': ...}}, + # 'ContextRelevancyScorer': {'relevancy_score': {'mean': ...}}, + # 'model_latency': {'mean': ...}} ``` - ```plaintext @@ -491,3 +453,14 @@ import TabItem from '@theme/TabItem'; ``` + +**Note:** The built-in scorers were calibrated using OpenAI models (e.g. `openai/gpt-4o`, `openai/text-embedding-3-small`). If you wish to experiment with other providers, you can simply update the `model_id`. 
For example, to use an Anthropic model: + +```python +from weave.scorers import SummarizationScorer + +# Switch to Anthropic's Claude model +summarization_scorer = SummarizationScorer( + model_id="anthropic/claude-3-5-sonnet-20240620" +) +``` From 31ccb41187a26bb58520473cc030c7566df12090 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 7 Feb 2025 14:01:51 +0100 Subject: [PATCH 2/4] andrew's comments --- docs/docs/guides/evaluation/builtin_scorers.mdx | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/docs/guides/evaluation/builtin_scorers.mdx b/docs/docs/guides/evaluation/builtin_scorers.mdx index cc8c4958b6f6..4f463f1a7d03 100644 --- a/docs/docs/guides/evaluation/builtin_scorers.mdx +++ b/docs/docs/guides/evaluation/builtin_scorers.mdx @@ -14,8 +14,7 @@ import TabItem from '@theme/TabItem'; ``` **LLM-evaluators** - - The pre-defined scorers that leverage LLMs now automatically integrate with litellm. + Update Feb 2025: The pre-defined scorers that leverage LLMs now automatically integrate with litellm. You no longer need to pass an LLM client; just set the `model_id`. See the supported models [here](https://docs.litellm.ai/docs/providers). @@ -81,7 +80,7 @@ import TabItem from '@theme/TabItem'; from weave.scorers import SummarizationScorer scorer = SummarizationScorer( - model_id="openai/gpt-4o" # or any other model supported by litellm + model_id="openai/gpt-4o" # or any other model supported by litellm ) ``` @@ -89,7 +88,7 @@ import TabItem from '@theme/TabItem'; This scorer evaluates summaries in two ways: - 1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary to estimate its information density. An LLM extracts the entities. + 1. **Entity Density:** Checks the ratio of unique entities (like names, places, or things) mentioned in the summary to the total word count in the summary in order to estimate the "information density" of the summary. Uses an LLM to extract the entities. Similar to how entity density is used in the Chain of Density paper, https://arxiv.org/abs/2309.04269 2. **Quality Grading:** An LLM evaluator grades the summary as `poor`, `ok`, or `excellent`. These grades are then mapped to scores (0.0 for poor, 0.5 for ok, and 1.0 for excellent) for aggregate performance evaluation. 
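 
   The two checks above reduce to simple arithmetic once the LLM has returned the extracted entities and the grade. The snippet below is only an illustrative sketch of that arithmetic: the entity list and the grade are hard-coded stand-ins for the LLM responses, and the scorer performs the actual LLM calls and any density thresholding internally.
 
   ```python
   # Illustrative sketch only; not the scorer's actual implementation.
   summary = "Paris is the capital of France."
   extracted_entities = ["Paris", "France"]  # assumed to come from the LLM
   grade = "ok"                              # assumed to come from the LLM
 
   # Entity density: unique entities per word in the summary
   entity_density = len(set(extracted_entities)) / len(summary.split())
 
   # Quality grade mapped to a numeric score so results can be averaged
   grade_to_score = {"poor": 0.0, "ok": 0.5, "excellent": 1.0}
   summarization_eval_score = grade_to_score[grade]
 
   print(entity_density, summarization_eval_score)  # -> 0.3333333333333333 0.5
   ```
 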
**Customization:** @@ -115,7 +114,7 @@ import TabItem from '@theme/TabItem'; # Initialize scorer summarization_scorer = SummarizationScorer( - model_id="openai/gpt-4o" # or any other model supported by litellm + model_id="openai/gpt-4o" # or any other model supported by litellm ) # Create dataset dataset = [ @@ -167,7 +166,7 @@ import TabItem from '@theme/TabItem'; # Initialize scorer moderation_scorer = OpenAIModerationScorer( - model_id="openai/text-embedding-3-small" # or any other model supported by litellm + model_id="openai/text-embedding-3-small" # or any other model supported by litellm ) # Create dataset @@ -194,7 +193,7 @@ import TabItem from '@theme/TabItem'; from weave.scorers import EmbeddingSimilarityScorer similarity_scorer = EmbeddingSimilarityScorer( - model_id="openai/text-embedding-3-small", # or any other model supported by litellm + model_id="openai/text-embedding-3-small", # or any other model supported by litellm target_column="reference_text", # the dataset column to compare the output against threshold=0.4 # the cosine similarity threshold ) @@ -214,7 +213,7 @@ import TabItem from '@theme/TabItem'; # Initialize scorer similarity_scorer = EmbeddingSimilarityScorer( - model_id="openai/text-embedding-3-small", # or any other model supported by litellm + model_id="openai/text-embedding-3-small", # or any other model supported by litellm target_column="reference_text", threshold=0.7 ) @@ -377,7 +376,7 @@ import TabItem from '@theme/TabItem'; from weave.scorers import ContextRelevancyScorer relevancy_scorer = ContextRelevancyScorer( - model_id="openai/gpt-4o", # or any other model supported by litellm + model_id="openai/gpt-4o", # or any other model supported by litellm relevancy_prompt=""" Given the following question and context, rate the relevancy of the context to the question on a scale from 0 to 1. From 2c26d3e7881ab39590737c936fd8affa67b64c20 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 7 Feb 2025 14:03:59 +0100 Subject: [PATCH 3/4] wrong model in moderation api --- docs/docs/guides/evaluation/builtin_scorers.mdx | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/docs/guides/evaluation/builtin_scorers.mdx b/docs/docs/guides/evaluation/builtin_scorers.mdx index 4f463f1a7d03..b245bd8f15e8 100644 --- a/docs/docs/guides/evaluation/builtin_scorers.mdx +++ b/docs/docs/guides/evaluation/builtin_scorers.mdx @@ -138,9 +138,7 @@ import TabItem from '@theme/TabItem'; ```python from weave.scorers import OpenAIModerationScorer - scorer = OpenAIModerationScorer( - model_id="openai/text-embedding-3-small" # or any other model supported by litellm - ) + scorer = OpenAIModerationScorer() ``` **How It Works:** @@ -148,10 +146,6 @@ import TabItem from '@theme/TabItem'; - Sends the AI's output to the OpenAI Moderation endpoint and returns a structured response indicating if the content is flagged. **Notes:** - - - Requires the `openai` Python package. - - The LLM client is now managed internally. 
- Here is an example in the context of an evaluation: ```python @@ -165,9 +159,7 @@ import TabItem from '@theme/TabItem'; return input # Initialize scorer - moderation_scorer = OpenAIModerationScorer( - model_id="openai/text-embedding-3-small" # or any other model supported by litellm - ) + moderation_scorer = OpenAIModerationScorer() # Create dataset dataset = [ From 5514d46f2f6710a18762277cb6e30aea5e264d60 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 7 Feb 2025 14:09:00 +0100 Subject: [PATCH 4/4] fix Similarity scorer args names --- docs/docs/guides/evaluation/builtin_scorers.mdx | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/docs/guides/evaluation/builtin_scorers.mdx b/docs/docs/guides/evaluation/builtin_scorers.mdx index b245bd8f15e8..3437a7ef1b0f 100644 --- a/docs/docs/guides/evaluation/builtin_scorers.mdx +++ b/docs/docs/guides/evaluation/builtin_scorers.mdx @@ -186,14 +186,14 @@ import TabItem from '@theme/TabItem'; similarity_scorer = EmbeddingSimilarityScorer( model_id="openai/text-embedding-3-small", # or any other model supported by litellm - target_column="reference_text", # the dataset column to compare the output against threshold=0.4 # the cosine similarity threshold ) ``` + Note: You can use `column_map` to map the `target` column to a different name. + **Parameters:** - - `target`: This scorer expects a `target` column in your dataset. It calculates the cosine similarity of the embeddings of the `target` text to the AI system output. Use the `column_map` attribute if the column has a different name. - `threshold` (float): The minimum cosine similarity score (between -1 and 1) needed to consider the two texts similar (defaults to `0.5`). Here is an example usage in the context of an evaluation: @@ -206,18 +206,17 @@ import TabItem from '@theme/TabItem'; # Initialize scorer similarity_scorer = EmbeddingSimilarityScorer( model_id="openai/text-embedding-3-small", # or any other model supported by litellm - target_column="reference_text", threshold=0.7 ) # Create dataset dataset = [ { "input": "He's name is John", - "reference": "John likes various types of cheese.", + "target": "John likes various types of cheese.", }, { "input": "He's name is Pepe.", - "reference": "Pepe likes various types of cheese.", + "target": "Pepe likes various types of cheese.", }, ] # Define model