707 add promptflow evals quality metrics #731

Draft · wants to merge 24 commits into base: development
4 changes: 2 additions & 2 deletions README.md
@@ -351,7 +351,7 @@ Every array will produce the combinations of flat configurations when the method
"temperature": "determines the OpenAI temperature. Valid value ranges from 0 to 1."
},
"eval": {
"metric_types": "determines the metrics used for evaluation (end-to-end or component-wise metrics using LLMs). Valid values for end-to-end metrics are lcsstr, lcsseq, cosine, jaro_winkler, hamming, jaccard, levenshtein, fuzzy_score, cosine_ochiai, bert_all_MiniLM_L6_v2, bert_base_nli_mean_tokens, bert_large_nli_mean_tokens, bert_large_nli_stsb_mean_tokens, bert_distilbert_base_nli_stsb_mean_tokens, bert_paraphrase_multilingual_MiniLM_L12_v2. Valid values for component-wise LLM-based metrics are llm_answer_relevance, llm_context_precision and llm_context_recall. e.g ['fuzzy_score','bert_all_MiniLM_L6_v2','cosine_ochiai','bert_distilbert_base_nli_stsb_mean_tokens', 'llm_answer_relevance']",
"metric_types": "determines the metrics used for evaluation (end-to-end or component-wise metrics using LLMs). Valid values for end-to-end metrics are lcsstr, lcsseq, cosine_ochiai, jaro_winkler, hamming, jaccard, levenshtein, fuzzy_score, cosine_ochiai, bert_all_MiniLM_L6_v2, bert_base_nli_mean_tokens, bert_large_nli_mean_tokens, bert_large_nli_stsb_mean_tokens, bert_distilbert_base_nli_stsb_mean_tokens, bert_paraphrase_multilingual_MiniLM_L12_v2. Valid values for component-wise LLM-based metrics are llm_answer_relevance, llm_context_precision and llm_context_recall. e.g ['fuzzy_score','bert_all_MiniLM_L6_v2','cosine_ochiai','bert_distilbert_base_nli_stsb_mean_tokens', 'llm_answer_relevance']",
}
}
```
@@ -516,7 +516,7 @@ During the QnA generation step, you may occasionally encounter errors related to

**End-to-end evaluation metrics:** not all the metrics comparing the generated and ground-truth answers are able to capture differences in semantics. For example, metrics such as `levenshtein` or `jaro_winkler` only measure edit distances. The `cosine` metric doesn't allow the comparison of semantics either: it uses the *textdistance* token-based implementation based on term frequency vectors. To calculate the semantic similarity between the generated answers and the expected responses, consider using embedding-based metrics such as Bert scores (`bert_`).
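
For intuition, here is a small, purely illustrative check (not part of the accelerator's evaluation code) of how edit-distance metrics can underrate a valid paraphrase; it calls the *textdistance* package directly and the sample strings are invented:

```python
# Illustrative only: edit-distance metrics compare characters, not meaning,
# so a reasonable paraphrase can still score low.
import textdistance

expected = "The contract can be terminated with 30 days notice."
actual = "Either party may end the agreement after one month's warning."

# Normalized similarities in [0, 1]; neither reflects that the two answers
# mean roughly the same thing.
print(textdistance.levenshtein.normalized_similarity(expected, actual))
print(textdistance.jaro_winkler.normalized_similarity(expected, actual))
```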

**Component-wise evaluation metrics:** evaluation metrics using LLM-as-judges aren't deterministic. The `llm_` metrics included in the accelerator use the model indicated in the `azure_oai_eval_deployment_name` config field. The prompts used for evaluation instruction can be adjusted and are included in the `prompts.py` file (`llm_answer_relevance_instruction`, `llm_context_recall_instruction`, `llm_context_precision_instruction`).
**Component-wise evaluation metrics:** evaluation metrics using LLM-as-judges aren't deterministic. The `llm_` metrics included in the accelerator use the model indicated in the `azure_oai_eval_deployment_name` config field. The prompts used for evaluation instruction can be adjusted and are included in the `prompts.py` file (`ragas_answer_relevance_instruction`, `ragas_context_recall_instruction`, `ragas_context_precision_instruction`).

**Retrieval-based metrics:** MAP scores are computed by comparing each retrieved chunk against the question and the chunk used to generate the qna pair. To assess whether a retrieved chunk is relevant or not, the similarity between the retrieved chunk and the concatenation of the end user question and the chunk used in the qna step (`02_qa_generation.py`) is computed using the SpacyEvaluator. Spacy similarity defaults to the average of the token vectors, meaning that the computation is insensitive to the order of the words. By default, the similarity threshold is set to 80% (`spacy_evaluator.py`).
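
The following is a rough sketch of that relevance check, assuming a spaCy model with word vectors (for example `en_core_web_md`) is installed; the function name and structure are illustrative rather than copied from `spacy_evaluator.py`:

```python
# Rough sketch of the relevance judgement described above; illustrative only,
# not the actual spacy_evaluator.py implementation.
import spacy

nlp = spacy.load("en_core_web_md")  # any spaCy model with word vectors
SIMILARITY_THRESHOLD = 0.8  # mirrors the 80% default mentioned above


def is_relevant(retrieved_chunk: str, question: str, qna_source_chunk: str) -> bool:
    """Compare a retrieved chunk against the question concatenated with the
    chunk used during QnA generation."""
    reference = nlp(f"{question} {qna_source_chunk}")
    candidate = nlp(retrieved_chunk)
    # Doc.similarity averages token vectors, so word order does not matter.
    return candidate.similarity(reference) >= SIMILARITY_THRESHOLD
```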

4 changes: 2 additions & 2 deletions config.sample.json
@@ -80,8 +80,8 @@
"bert_all_MiniLM_L6_v2",
"cosine_ochiai",
"bert_distilbert_base_nli_stsb_mean_tokens",
"llm_answer_relevance",
"llm_context_precision"
"ragas_answer_relevance",
"ragas_context_precision"
]
}
}
11 changes: 8 additions & 3 deletions config.schema.json
@@ -577,9 +577,14 @@
"bert_large_nli_stsb_mean_tokens",
"bert_distilbert_base_nli_stsb_mean_tokens",
"bert_paraphrase_multilingual_MiniLM_L12_v2",
"llm_answer_relevance",
"llm_context_precision",
"llm_context_recall"
"ragas_answer_relevance",
"ragas_context_precision",
"ragas_context_recall",
"pf_answer_relevance",
"pf_answer_coherence",
"pf_answer_fluency",
"pf_answer_similarity",
"pf_answer_groundedness"
]
},
"description": "Metrics used for evaluation"
1 change: 1 addition & 0 deletions dev-requirements.txt
@@ -1,3 +1,4 @@
azure-ai-evaluation==1.0.0b3
promptflow==1.15.0
promptflow-tools==1.4.0
pytest==8.3.3
20 changes: 10 additions & 10 deletions docs/evaluation-metrics.md
@@ -16,7 +16,7 @@ You can choose which metrics should be calculated in your experiment by updating
"metric_types": [
"lcsstr",
"lcsseq",
"cosine",
"cosine_ochiai",
"jaro_winkler",
"hamming",
"jaccard",
@@ -37,9 +37,9 @@ You can choose which metrics should be calculated in your experiment by updating
"bert_large_nli_stsb_mean_tokens",
"bert_distilbert_base_nli_stsb_mean_tokens",
"bert_paraphrase_multilingual_MiniLM_L12_v2",
"llm_answer_relevance",
"llm_context_precision",
"llm_context_recall"
"ragas_answer_relevance",
"ragas_context_precision",
"ragas_context_recall"
]
```

@@ -66,9 +66,9 @@ Computes the longest common subsequence (LCS) similarity score between two input

### Cosine similarity (Ochiai coefficient)

| Configuration Key | Calculation Base | Possible Values |
| ----------------- | -------------------- | ------------------ |
| `cosine` | `actual`, `expected` | Percentage (0-100) |
| Configuration Key | Calculation Base | Possible Values |
| -------------------------| -------------------- | ------------------ |
| `cosine_ochiai` | `actual`, `expected` | Percentage (0-100) |

This coefficient is calculated as the intersection of the term-frequency vectors of the generated answer (actual) and the ground-truth answer (expected) divided by the geometric mean of the sizes of these vectors.
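
For illustration, a minimal sketch of that computation over whitespace-tokenized term frequencies (the accelerator itself relies on the *textdistance* implementation rather than this sketch):

```python
# Minimal illustration of the Ochiai coefficient over term-frequency vectors.
import math
from collections import Counter


def ochiai(actual: str, expected: str) -> float:
    a, b = Counter(actual.lower().split()), Counter(expected.lower().split())
    intersection = sum(min(a[t], b[t]) for t in a.keys() & b.keys())
    geometric_mean = math.sqrt(sum(a.values()) * sum(b.values()))
    return intersection / geometric_mean if geometric_mean else 0.0


print(round(100 * ochiai("the cat sat on the mat", "a cat sat on a mat"), 1))  # 66.7
```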

@@ -168,7 +168,7 @@ These metrics also require the `chat_model_name` property to be set in the `search

| Configuration Key | Calculation Base | Possible Values |
| ------------------ | -------------------- | --------------------------------- |
| `llm_answer_relevance` | `actual`, `expected` | From 0 to 1 with 1 being the best |
| `ragas_answer_relevance` | `actual`, `expected` | From 0 to 1 with 1 being the best |

Scores the relevancy of the answer according to the given question. Answers with incomplete, redundant or unnecessary information are penalized.
@@ -177,14 +177,14 @@

| Configuration Key | Calculation Base | Possible Values |
| ------------------- | ------------------- | ----------------------------------------------------------------- |
| `llm_context_precision` | `question`, `retrieved_contexts` | Percentage (0-100) |
| `ragas_context_precision` | `question`, `retrieved_contexts` | Percentage (0-100) |

Proportion of retrieved contexts relevant to the question. Evaluates whether or not the context generated by the RAG solution is useful for answering a question.

### LLM Context recall

| Configuration Key | Calculation Base | Possible Values |
| ------------------- | ------------------- | ----------------------------------------------------------------- |
| `llm_context_recall` | `question`, `expected`, `retrieved_contexts` | Percentage (0-100) |
| `ragas_context_recall` | `question`, `expected`, `retrieved_contexts` | Percentage (0-100) |

Estimates context recall by estimating TP and FN using annotated answer (ground truth) and retrieved contexts. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context.
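
As a conceptual sketch of that TP/FN bookkeeping (the attribution judgement itself is delegated to the LLM judge, so this is not the accelerator's prompt-based implementation):

```python
# Conceptual sketch of context recall: ground-truth answer sentences supported
# by the retrieved contexts count as TP, unsupported ones as FN. The support
# judgement comes from the LLM judge; this helper only aggregates its verdicts.
def context_recall(supported_flags: list[bool]) -> float:
    """One boolean per ground-truth sentence: True if attributable to the contexts."""
    if not supported_flags:
        return 0.0
    true_positives = sum(supported_flags)
    return 100 * true_positives / len(supported_flags)


print(round(context_recall([True, True, False]), 1))  # 66.7
```
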
2 changes: 1 addition & 1 deletion promptflow/rag-experiment-accelerator/README.md
@@ -117,7 +117,7 @@ az ml environment create --file ./environment.yaml -w $MLWorkSpaceName
"cross_encoder_model" :"determines the model used for cross-encoding re-ranking step. Valid value is cross-encoder/stsb-roberta-base",
"search_types" : "determines the search types used for experimentation. Valid value are search_for_match_semantic, search_for_match_Hybrid_multi, search_for_match_Hybrid_cross, search_for_match_text, search_for_match_pure_vector, search_for_match_pure_vector_multi, search_for_match_pure_vector_cross, search_for_manual_hybrid. e.g. ['search_for_manual_hybrid', 'search_for_match_Hybrid_multi','search_for_match_semantic' ]",
"retrieve_num_of_documents": "determines the number of chunks to retrieve from the search index",
"metric_types" : "determines the metrics used for evaluation purpose. Valid value are lcsstr, lcsseq, cosine, jaro_winkler, hamming, jaccard, levenshtein, fuzzy_score, cosine_ochiai, bert_all_MiniLM_L6_v2, bert_base_nli_mean_tokens, bert_large_nli_mean_tokens, bert_large_nli_stsb_mean_tokens, bert_distilbert_base_nli_stsb_mean_tokens, bert_paraphrase_multilingual_MiniLM_L12_v2, llm_context_precision, llm_answer_relevance. e.g ['fuzzy_score','bert_all_MiniLM_L6_v2','cosine_ochiai','bert_distilbert_base_nli_stsb_mean_tokens']",
"metric_types" : "determines the metrics used for evaluation purpose. Valid value are lcsstr, lcsseq, jaro_winkler, hamming, jaccard, levenshtein, fuzzy_score, cosine_ochiai, bert_all_MiniLM_L6_v2, bert_base_nli_mean_tokens, bert_large_nli_mean_tokens, bert_large_nli_stsb_mean_tokens, bert_distilbert_base_nli_stsb_mean_tokens, bert_paraphrase_multilingual_MiniLM_L12_v2, ragas_context_precision, ragas_answer_relevance. e.g ['fuzzy_score','bert_all_MiniLM_L6_v2','cosine_ochiai','bert_distilbert_base_nli_stsb_mean_tokens']",
"azure_oai_chat_deployment_name": "determines the Azure OpenAI chat deployment name",
"azure_oai_eval_deployment_name": "determines the Azure OpenAI evaluation deployment name",
"embedding_model_name": "embedding model name",
4 changes: 2 additions & 2 deletions rag_experiment_accelerator/config/eval_config.py
@@ -10,7 +10,7 @@ class EvalConfig(BaseConfig):
"bert_all_MiniLM_L6_v2",
"cosine_ochiai",
"bert_distilbert_base_nli_stsb_mean_tokens",
"llm_answer_relevance",
"llm_context_precision",
"ragas_answer_relevance",
"ragas_context_precision",
]
)
145 changes: 145 additions & 0 deletions rag_experiment_accelerator/evaluation/azure_ai_metrics.py
@@ -0,0 +1,145 @@
from azure.ai.evaluation import (
CoherenceEvaluator,
FluencyEvaluator,
GroundednessEvaluator,
RelevanceEvaluator,
SimilarityEvaluator,
)

from rag_experiment_accelerator.config.environment import Environment


class AzureAIEvals:
"""Class that leverages the evaluators from the Promptflow evaluation framework
for LLM pipelines"""
def __init__(self, environment: Environment, deployment_name: str):
self.model_config = {
"azure_endpoint": environment.openai_endpoint,
"api_key": environment.openai_api_key,
"deployment_name": deployment_name
}

def compute_score(
self,
metric_name: str,
question: str,
generated_answer: str,
ground_truth_answer: str,
retrieved_contexts: list[str],
) -> float:
"""
Compute LLM as a judge score based on the Promptflow evaluation framework.
"""
match metric_name:
case "azai_answer_relevance":
score = self.relevance_evaluator(
question=question, answer=generated_answer
)
case "azai_answer_coherence":
score = self.coherence_evaluator(
question=question, answer=generated_answer
)
case "azai_answer_similarity":
score = self.similarity_evaluator(
question=question,
answer=generated_answer,
ground_truth=ground_truth_answer,
)
case "azai_answer_fluency":
score = self.fluency_evaluator(
question=question, answer=generated_answer
)
case "azai_answer_groundedness":
score = self.groundedness_evaluator(
answer=generated_answer, retrieved_contexts=retrieved_contexts
)
case _:
raise KeyError(f"Invalid metric type: {metric_name}")

return score

def relevance_evaluator(self, question: str, answer: str) -> float:
eval_fn = RelevanceEvaluator(model_config=self.model_config)
score = eval_fn(question=question, answer=answer)
return score

def coherence_evaluator(self, question: str, answer: str) -> float:
eval_fn = CoherenceEvaluator(model_config=self.model_config)
score = eval_fn(question=question, answer=answer)
return score

def similarity_evaluator(
self, question: str, answer: str, ground_truth: str
) -> float:
"""
Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer.
If the information and content in the predicted answer is similar or equivalent to the correct answer,
then the value of the Equivalence metric should be high, else it should be low. Given the question,
correct answer, and predicted answer, determine the value of Equivalence metric using the following
rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer

This rating value should always be an integer between 1 and 5.
"""
eval_fn = SimilarityEvaluator(model_config=self.model_config)
score = eval_fn(question=question, answer=answer, ground_truth=ground_truth)
return score

def fluency_evaluator(self, question: str, answer: str) -> float:
"""
Fluency measures the quality of individual sentences in the answer,
and whether they are well-written and grammatically correct. Consider
the quality of individual sentences when evaluating fluency. Given the
question and answer, score the fluency of the answer between one to
five stars using the following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency

This rating value should always be an integer between 1 and 5.
"""
eval_fn = FluencyEvaluator(model_config=self.model_config)
score = eval_fn(question=question, answer=answer)
return score

def groundedness_evaluator(
self, answer: str, retrieved_contexts: list[str]
) -> float:
"""
Groundedness is measured the following way:
Given a CONTEXT and an ANSWER about that CONTEXT, rate the following way if the ANSWER is
entailed by the CONTEXT,
1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
3. an integer score between 1 and 5 and if such integer score does not exist, use 1:
It is not possible to determine whether the ANSWER is true or false without
further information. Read the passage of information thoroughly and select the
correct answer from the three answer labels. Read the CONTEXT thoroughly to
ensure you know what the CONTEXT entails.

This rating value should always be an integer between 1 and 5.

Here we have a list of contexts and an answer. We return the best (max) groundedness score
when comparing the answer with each context in the list.

Args:
answer (str): The answer generated by the model.
retrieved_contexts (list[str]): The list of retrieved contexts for the query.

Returns:
float: The groundedness score generated between the answer and the list of contexts
"""
eval_fn = GroundednessEvaluator(model_config=self.model_config)

best_score = 0
for context in retrieved_contexts:
score = eval_fn(context=context, answer=answer)
best_score = max(best_score, score)

return best_score
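
A hypothetical usage sketch of the class above: the `SimpleNamespace` stand-in, endpoint, deployment name and sample strings are all invented, since `AzureAIEvals` only needs an object exposing `openai_endpoint` and `openai_api_key` plus an evaluation deployment name.

```python
# Hypothetical usage of AzureAIEvals. The SimpleNamespace stands in for the
# accelerator's Environment object; values and metric choice are illustrative.
from types import SimpleNamespace

from rag_experiment_accelerator.evaluation.azure_ai_metrics import AzureAIEvals

environment = SimpleNamespace(
    openai_endpoint="https://<your-resource>.openai.azure.com/",
    openai_api_key="<api-key>",
)
evaluator = AzureAIEvals(environment, deployment_name="<eval-deployment>")

score = evaluator.compute_score(
    metric_name="azai_answer_similarity",  # one of the azai_* names handled in compute_score
    question="What is the notice period for termination?",
    generated_answer="The contract can be terminated with 30 days notice.",
    ground_truth_answer="Termination requires one month's written notice.",
    retrieved_contexts=["Clause 12: either party may terminate with 30 days notice."],
)
print(score)
```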