
Commit ac028ae

Authored Feb 7, 2025
Merge branch 'master' into feat/google-genai-2.x
2 parents: b6035b4 + c6dcf6c

35 files changed: +549 -595 lines

‎.github/workflows/test.yaml

+2 -1

@@ -252,7 +252,7 @@ jobs:
           "notdiamond",
           "openai",
           "vertexai",
-          "scorers_tests",
+          "scorers",
           "pandas-test",
         ]
       fail-fast: false
@@ -320,6 +320,7 @@ jobs:
       WEAVE_SERVER_DISABLE_ECOSYSTEM: 1
       WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
       GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

‎docs/docs/quickstart.md

+1 -1

@@ -131,7 +131,7 @@ _In this example, we're using openai so you will need to add an OpenAI [API key]

 ## 3. Automated LLM library logging

-Calls made to OpenAI, Anthropic and [many more LLM libraries](guides/integrations/) are automatically tracked with Weave, with **LLM metadata**, **token usage** and **cost** being logged automatically. If your LLM library isn't currently one of our integrations you can track calls to other LLMs libraries or frameworks easily by wrapping them with `@weave.op()`.
+Calls made to OpenAI, Anthropic and [many more LLM libraries](./guides/integrations/index.md) are automatically tracked with Weave, with **LLM metadata**, **token usage** and **cost** being logged automatically. If your LLM library isn't currently one of our integrations you can track calls to other LLMs libraries or frameworks easily by wrapping them with `@weave.op()`.

 ## 4. See traces of your application in your project

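For readers following the quickstart, the wrapping pattern that paragraph refers to looks roughly like this (a minimal sketch, not part of this commit; the project name and `call_my_llm` are hypothetical stand-ins for any LLM client Weave does not integrate with):

```python
import weave

weave.init("quickstart-example")  # hypothetical project name

@weave.op()
def call_my_llm(prompt: str) -> str:
    # Stand-in for a call into an LLM library Weave has no integration for;
    # the decorator records the call's inputs, output and latency as a trace.
    return f"echo: {prompt}"

call_my_llm("Why is the sky blue?")
```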

‎noxfile.py

+5 -4

@@ -16,7 +16,7 @@
     "notdiamond",
     "google_ai_studio",
     "bedrock",
-    "scorers_tests",
+    "scorers",
 ]


@@ -56,7 +56,7 @@ def lint(session):
        "openai",
        "vertexai",
        "bedrock",
-        "scorers_tests",
+        "scorers",
        "pandas-test",
    ],
 )
@@ -90,8 +90,9 @@ def tests(session, shard):

     # we are doing some integration test in test_llm_integrations.py that requires
     # setting some environment variables for the LLM providers
-    if shard == "scorers_tests":
+    if shard == "scorers":
         env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY")
+        env["GEMINI_API_KEY"] = session.env.get("GEMINI_API_KEY")
         env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY")
         env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY")
         env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY")
@@ -103,7 +104,7 @@ def tests(session, shard):
         "trace_server": ["trace_server/"],
         "mistral0": ["integrations/mistral/v0/"],
         "mistral1": ["integrations/mistral/v1/"],
-        "scorers_tests": ["scorers/"],
+        "scorers": ["scorers/"],
     }

     test_dirs = test_dirs_dict.get(shard, default_test_dirs)

‎pyproject.toml

+1 -10

@@ -76,16 +76,7 @@ litellm = ["litellm>=1.36.1"]
 llamaindex = ["llama-index>=0.10.35"]
 mistral0 = ["mistralai>=0.1.8,<1.0.0"]
 mistral1 = ["mistralai>=1.0.0"]
-scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"]
-scorers_tests = [
-    "instructor>=1.5.2",
-    "Levenshtein>=0.26.0",
-    "openai>=1.0.0",
-    "google-generativeai>=0.8.0",
-    "mistralai>=1.0.3",
-    "anthropic>=0.30.0",
-    "litellm>=1.58.2",
-]
+scorers = ["Levenshtein>=0.26.0","litellm>=1.58"]
 notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"]
 openai = ["openai>=1.0.0"]
 pandas-test = ["pandas>=2.2.3"]
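The slimmed-down `scorers` extra keeps only Levenshtein and litellm; as the updated scorer tests below suggest, provider access now goes through litellm's `acompletion` and its `provider/model` id convention instead of per-provider SDKs. A minimal sketch of that convention (illustration only, not from the diff, assuming `litellm` is installed and `OPENAI_API_KEY` is set):

```python
import asyncio

import litellm

async def main():
    # litellm routes a "provider/model" id to the matching backend, e.g.
    # "openai/gpt-4o", "anthropic/claude-3-5-sonnet-20240620", "gemini/gemini-2.0-flash".
    response = await litellm.acompletion(
        model="openai/gpt-4o",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```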

‎tests/scorers/test_hallucination_scorer.py

+31 -21

@@ -1,49 +1,59 @@
+import json
+
 import pytest
-from openai import OpenAI
+from pydantic import BaseModel

 import weave
 from weave.scorers import (
     HallucinationFreeScorer,
 )
 from weave.scorers.hallucination_scorer import (
-    HallucinationReasoning,
     HallucinationResponse,
 )


-# mock the create function
 @pytest.fixture
-def mock_create(monkeypatch):
-    def _mock_create(*args, **kwargs):
-        return HallucinationResponse(
-            chain_of_thought="The output is consistent with the input data.",
-            reasonings=[
-                HallucinationReasoning(
-                    observation="My observation for this is that the output is consistent with the input data.",
-                    hallucination_type="No Hallucination",
-                )
+def hallucination_scorer(monkeypatch):
+    async def _mock_acompletion(*args, **kwargs):
+        content = {
+            "chain_of_thought": "The output is consistent with the input data.",
+            "reasonings": [
+                {
+                    "hallucination_type": "No Hallucination",
+                    "observation": "My observation for this is that the output is consistent with the input data.",
+                }
             ],
-            conclusion="The output is consistent with the input data.",
-            has_hallucination=True,
-        )
+            "conclusion": "The output is consistent with the input data.",
+            "has_hallucination": True,
+        }

-    monkeypatch.setattr("weave.scorers.hallucination_scorer.create", _mock_create)
+        class Message(BaseModel):
+            content: str

+        class Choice(BaseModel):
+            message: Message
+
+        class Response(BaseModel):
+            choices: list[Choice]
+
+        return Response(choices=[Choice(message=Message(content=json.dumps(content)))])
+
+    monkeypatch.setattr(
+        "weave.scorers.hallucination_scorer.acompletion", _mock_acompletion
+    )

-@pytest.fixture
-def hallucination_scorer(mock_create):
     return HallucinationFreeScorer(
-        client=OpenAI(api_key="DUMMY_API_KEY"),
         model_id="gpt-4o",
         temperature=0.7,
         max_tokens=4096,
     )


-def test_hallucination_scorer_score(hallucination_scorer, mock_create):
+@pytest.mark.asyncio
+async def test_hallucination_scorer_score(hallucination_scorer):
     output = "John's favorite cheese is cheddar."
     context = "John likes various types of cheese."
-    result = hallucination_scorer.score(output=output, context=context)
+    result = await hallucination_scorer.score(output=output, context=context)
     # we should be able to do this validation
     _ = HallucinationResponse.model_validate(result)

‎tests/scorers/test_llm_integrations.py

+46 -65

@@ -7,76 +7,57 @@
     SummarizationScorer,
 )

-# Define providers and their models
-TEST_MODELS = {
-    "openai": ["gpt-4o-mini", "gpt-4o"],
-    "anthropic": ["claude-3-haiku-20240307", "claude-3-5-sonnet-20240620"],
-    "mistral": ["mistral-small-latest", "mistral-large-latest"],
-    "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"],
+# Centralized dictionary for providers with their associated models and API key environment variable.
+PROVIDERS = {
+    "openai": {
+        "models": ["openai/gpt-4o"],
+        "env_key": "OPENAI_API_KEY",
+    },
+    "anthropic": {
+        "models": ["anthropic/claude-3-5-sonnet-20240620"],
+        "env_key": "ANTHROPIC_API_KEY",
+    },
+    "mistral": {
+        "models": ["mistral/mistral-large-latest"],
+        "env_key": "MISTRAL_API_KEY",
+    },
+    "gemini": {
+        "models": ["gemini/gemini-2.0-flash"],
+        "env_key": "GOOGLE_API_KEY",
+    },
 }


-def get_client_and_model(provider, model):
-    api_key_env_vars = {
-        "openai": "OPENAI_API_KEY",
-        "anthropic": "ANTHROPIC_API_KEY",
-        "mistral": "MISTRAL_API_KEY",
-        "gemini": "GOOGLE_API_KEY",
-    }
-
-    if provider not in TEST_MODELS:
-        raise ValueError(f"Unknown provider: {provider}")
-
-    if model not in TEST_MODELS[provider]:
-        raise ValueError(f"Model '{model}' not available for provider '{provider}'")
-
-    api_key = os.getenv(api_key_env_vars[provider])
-    if not api_key:
-        raise OSError(
-            f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable."
-        )
-
-    if provider == "openai":
-        from openai import OpenAI
-
-        client = OpenAI(api_key=api_key)
-    elif provider == "anthropic":
-        from anthropic import Anthropic
-
-        client = Anthropic(api_key=api_key)
-    elif provider == "mistral":
-        from mistralai import Mistral
-
-        client = Mistral(api_key=api_key)
-    elif provider == "gemini":
-        import google.generativeai as genai
-
-        genai.configure(api_key=api_key)
-        client = genai.GenerativeModel(model_name=model)
-        model = "gemini"  # Adjust if necessary
-
-    return client, model
-
-
-# Generate test parameters
-test_params = [
-    (provider, model) for provider, models in TEST_MODELS.items() for model in models
-]
-
-
-@pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}")
-def test_summarization_scorer_evaluate_summary(provider, model):
-    client, model_id = get_client_and_model(provider, model)
-
-    summarization_scorer = SummarizationScorer(
-        client=client,
-        model_id=model_id,
+@pytest.fixture(
+    params=[
+        (provider, model)
+        for provider, cfg in PROVIDERS.items()
+        for model in cfg["models"]
+    ],
+    ids=lambda p: f"{p[0]}:{p[1]}",
+)
+def summarization_scorer(request):
+    """
+    Fixture that returns an instance of SummarizationScorer.
+    It checks if the required API key is available, and if not, skips the test.
+    """
+    provider, model = request.param
+    env_key = PROVIDERS[provider]["env_key"]
+    if not os.getenv(env_key):
+        pytest.skip(f"API key for {provider} not found. Skipping test.")
+    return SummarizationScorer(
+        model_id=model,
         temperature=0.7,
         max_tokens=1024,
     )
-    input_text = "This is the original text."
-    summary_text = "This is the summary."
-    result = summarization_scorer.evaluate_summary(
-        input=input_text, summary=summary_text
+
+
+@pytest.mark.asyncio
+async def test_summarization_scorer_evaluate_summary(summarization_scorer):
+    input_text = "The wolf is lonely in the forest. He is not happy that the fox is not with him."
+    summary_text = "Wolf is lonely and missing the fox."
+    result = await summarization_scorer._evaluate_summary(
+        input=input_text,
+        summary=summary_text,
     )
     assert isinstance(result, SummarizationEvaluationResponse)
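For reference, the scorer the fixture builds can also be exercised directly with the same arguments (a minimal sketch, not from the commit: it calls the same private `_evaluate_summary` helper the test uses and assumes `OPENAI_API_KEY` is set, so treat it as illustrative rather than a supported public API):

```python
import asyncio
import os

from weave.scorers import SummarizationScorer

async def main():
    assert os.getenv("OPENAI_API_KEY"), "export OPENAI_API_KEY before running"
    scorer = SummarizationScorer(
        model_id="openai/gpt-4o",  # litellm-style "provider/model" id, as in PROVIDERS above
        temperature=0.7,
        max_tokens=1024,
    )
    result = await scorer._evaluate_summary(
        input="The wolf is lonely in the forest.",
        summary="Wolf is lonely.",
    )
    print(result)

asyncio.run(main())
```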

‎tests/scorers/test_prompt_injection_llm_guardrail.py

-45
This file was deleted.
