
Commit ac028ae

Authored Feb 7, 2025
Merge branch 'master' into feat/google-genai-2.x
2 parents: b6035b4 + c6dcf6c

35 files changed: +549 -595 lines

‎.github/workflows/test.yaml

+2 -1

@@ -252,7 +252,7 @@ jobs:
           "notdiamond",
           "openai",
           "vertexai",
-          "scorers_tests",
+          "scorers",
           "pandas-test",
         ]
       fail-fast: false
@@ -320,6 +320,7 @@ jobs:
       WEAVE_SERVER_DISABLE_ECOSYSTEM: 1
       WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
       GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

‎docs/docs/quickstart.md

+1 -1

@@ -131,7 +131,7 @@ _In this example, we're using openai so you will need to add an OpenAI [API key]

 ## 3. Automated LLM library logging

-Calls made to OpenAI, Anthropic and [many more LLM libraries](guides/integrations/) are automatically tracked with Weave, with **LLM metadata**, **token usage** and **cost** being logged automatically. If your LLM library isn't currently one of our integrations you can track calls to other LLMs libraries or frameworks easily by wrapping them with `@weave.op()`.
+Calls made to OpenAI, Anthropic and [many more LLM libraries](./guides/integrations/index.md) are automatically tracked with Weave, with **LLM metadata**, **token usage** and **cost** being logged automatically. If your LLM library isn't currently one of our integrations you can track calls to other LLMs libraries or frameworks easily by wrapping them with `@weave.op()`.

 ## 4. See traces of your application in your project

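For readers following the quickstart, the wrapping pattern that paragraph refers to looks roughly like this (a minimal sketch, not part of this commit; the project name and `call_my_llm` are hypothetical stand-ins for any LLM client Weave does not integrate with):

```python
import weave

weave.init("quickstart-example")  # hypothetical project name

@weave.op()
def call_my_llm(prompt: str) -> str:
    # Stand-in for a call into an LLM library Weave has no integration for;
    # the decorator records the call's inputs, output and latency as a trace.
    return f"echo: {prompt}"

call_my_llm("Why is the sky blue?")
```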

‎noxfile.py

+5 -4

@@ -16,7 +16,7 @@
     "notdiamond",
     "google_ai_studio",
     "bedrock",
-    "scorers_tests",
+    "scorers",
 ]


@@ -56,7 +56,7 @@ def lint(session):
        "openai",
        "vertexai",
        "bedrock",
-        "scorers_tests",
+        "scorers",
        "pandas-test",
    ],
 )
@@ -90,8 +90,9 @@ def tests(session, shard):

     # we are doing some integration test in test_llm_integrations.py that requires
     # setting some environment variables for the LLM providers
-    if shard == "scorers_tests":
+    if shard == "scorers":
         env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY")
+        env["GEMINI_API_KEY"] = session.env.get("GEMINI_API_KEY")
         env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY")
         env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY")
         env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY")
@@ -103,7 +104,7 @@ def tests(session, shard):
         "trace_server": ["trace_server/"],
         "mistral0": ["integrations/mistral/v0/"],
         "mistral1": ["integrations/mistral/v1/"],
-        "scorers_tests": ["scorers/"],
+        "scorers": ["scorers/"],
     }

     test_dirs = test_dirs_dict.get(shard, default_test_dirs)

‎pyproject.toml

+1 -10

@@ -76,16 +76,7 @@ litellm = ["litellm>=1.36.1"]
 llamaindex = ["llama-index>=0.10.35"]
 mistral0 = ["mistralai>=0.1.8,<1.0.0"]
 mistral1 = ["mistralai>=1.0.0"]
-scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"]
-scorers_tests = [
-    "instructor>=1.5.2",
-    "Levenshtein>=0.26.0",
-    "openai>=1.0.0",
-    "google-generativeai>=0.8.0",
-    "mistralai>=1.0.3",
-    "anthropic>=0.30.0",
-    "litellm>=1.58.2",
-]
+scorers = ["Levenshtein>=0.26.0","litellm>=1.58"]
 notdiamond = ["notdiamond>=0.3.21", "litellm<=1.49.1"]
 openai = ["openai>=1.0.0"]
 pandas-test = ["pandas>=2.2.3"]
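The slimmed-down `scorers` extra keeps only Levenshtein and litellm; as the updated scorer tests below suggest, provider access now goes through litellm's `acompletion` and its `provider/model` id convention instead of per-provider SDKs. A minimal sketch of that convention (illustration only, not from the diff, assuming `litellm` is installed and `OPENAI_API_KEY` is set):

```python
import asyncio

import litellm

async def main():
    # litellm routes a "provider/model" id to the matching backend, e.g.
    # "openai/gpt-4o", "anthropic/claude-3-5-sonnet-20240620", "gemini/gemini-2.0-flash".
    response = await litellm.acompletion(
        model="openai/gpt-4o",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```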

‎tests/scorers/test_hallucination_scorer.py

+31 -21

@@ -1,49 +1,59 @@
+import json
+
 import pytest
-from openai import OpenAI
+from pydantic import BaseModel

 import weave
 from weave.scorers import (
     HallucinationFreeScorer,
 )
 from weave.scorers.hallucination_scorer import (
-    HallucinationReasoning,
     HallucinationResponse,
 )


-# mock the create function
 @pytest.fixture
-def mock_create(monkeypatch):
-    def _mock_create(*args, **kwargs):
-        return HallucinationResponse(
-            chain_of_thought="The output is consistent with the input data.",
-            reasonings=[
-                HallucinationReasoning(
-                    observation="My observation for this is that the output is consistent with the input data.",
-                    hallucination_type="No Hallucination",
-                )
+def hallucination_scorer(monkeypatch):
+    async def _mock_acompletion(*args, **kwargs):
+        content = {
+            "chain_of_thought": "The output is consistent with the input data.",
+            "reasonings": [
+                {
+                    "hallucination_type": "No Hallucination",
+                    "observation": "My observation for this is that the output is consistent with the input data.",
+                }
             ],
-            conclusion="The output is consistent with the input data.",
-            has_hallucination=True,
-        )
+            "conclusion": "The output is consistent with the input data.",
+            "has_hallucination": True,
+        }

-    monkeypatch.setattr("weave.scorers.hallucination_scorer.create", _mock_create)
+        class Message(BaseModel):
+            content: str

+        class Choice(BaseModel):
+            message: Message
+
+        class Response(BaseModel):
+            choices: list[Choice]
+
+        return Response(choices=[Choice(message=Message(content=json.dumps(content)))])
+
+    monkeypatch.setattr(
+        "weave.scorers.hallucination_scorer.acompletion", _mock_acompletion
+    )

-@pytest.fixture
-def hallucination_scorer(mock_create):
     return HallucinationFreeScorer(
-        client=OpenAI(api_key="DUMMY_API_KEY"),
         model_id="gpt-4o",
         temperature=0.7,
         max_tokens=4096,
     )


-def test_hallucination_scorer_score(hallucination_scorer, mock_create):
+@pytest.mark.asyncio
+async def test_hallucination_scorer_score(hallucination_scorer):
     output = "John's favorite cheese is cheddar."
     context = "John likes various types of cheese."
-    result = hallucination_scorer.score(output=output, context=context)
+    result = await hallucination_scorer.score(output=output, context=context)
     # we should be able to do this validation
     _ = HallucinationResponse.model_validate(result)

‎tests/scorers/test_llm_integrations.py

+46 -65

@@ -7,76 +7,57 @@
     SummarizationScorer,
 )

-# Define providers and their models
-TEST_MODELS = {
-    "openai": ["gpt-4o-mini", "gpt-4o"],
-    "anthropic": ["claude-3-haiku-20240307", "claude-3-5-sonnet-20240620"],
-    "mistral": ["mistral-small-latest", "mistral-large-latest"],
-    "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"],
+# Centralized dictionary for providers with their associated models and API key environment variable.
+PROVIDERS = {
+    "openai": {
+        "models": ["openai/gpt-4o"],
+        "env_key": "OPENAI_API_KEY",
+    },
+    "anthropic": {
+        "models": ["anthropic/claude-3-5-sonnet-20240620"],
+        "env_key": "ANTHROPIC_API_KEY",
+    },
+    "mistral": {
+        "models": ["mistral/mistral-large-latest"],
+        "env_key": "MISTRAL_API_KEY",
+    },
+    "gemini": {
+        "models": ["gemini/gemini-2.0-flash"],
+        "env_key": "GOOGLE_API_KEY",
+    },
 }


-def get_client_and_model(provider, model):
-    api_key_env_vars = {
-        "openai": "OPENAI_API_KEY",
-        "anthropic": "ANTHROPIC_API_KEY",
-        "mistral": "MISTRAL_API_KEY",
-        "gemini": "GOOGLE_API_KEY",
-    }
-
-    if provider not in TEST_MODELS:
-        raise ValueError(f"Unknown provider: {provider}")
-
-    if model not in TEST_MODELS[provider]:
-        raise ValueError(f"Model '{model}' not available for provider '{provider}'")
-
-    api_key = os.getenv(api_key_env_vars[provider])
-    if not api_key:
-        raise OSError(
-            f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable."
-        )
-
-    if provider == "openai":
-        from openai import OpenAI
-
-        client = OpenAI(api_key=api_key)
-    elif provider == "anthropic":
-        from anthropic import Anthropic
-
-        client = Anthropic(api_key=api_key)
-    elif provider == "mistral":
-        from mistralai import Mistral
-
-        client = Mistral(api_key=api_key)
-    elif provider == "gemini":
-        import google.generativeai as genai
-
-        genai.configure(api_key=api_key)
-        client = genai.GenerativeModel(model_name=model)
-        model = "gemini"  # Adjust if necessary
-
-    return client, model
-
-
-# Generate test parameters
-test_params = [
-    (provider, model) for provider, models in TEST_MODELS.items() for model in models
-]
-
-
-@pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}")
-def test_summarization_scorer_evaluate_summary(provider, model):
-    client, model_id = get_client_and_model(provider, model)
-
-    summarization_scorer = SummarizationScorer(
-        client=client,
-        model_id=model_id,
+@pytest.fixture(
+    params=[
+        (provider, model)
+        for provider, cfg in PROVIDERS.items()
+        for model in cfg["models"]
+    ],
+    ids=lambda p: f"{p[0]}:{p[1]}",
+)
+def summarization_scorer(request):
+    """
+    Fixture that returns an instance of SummarizationScorer.
+    It checks if the required API key is available, and if not, skips the test.
+    """
+    provider, model = request.param
+    env_key = PROVIDERS[provider]["env_key"]
+    if not os.getenv(env_key):
+        pytest.skip(f"API key for {provider} not found. Skipping test.")
+    return SummarizationScorer(
+        model_id=model,
         temperature=0.7,
         max_tokens=1024,
     )
-    input_text = "This is the original text."
-    summary_text = "This is the summary."
-    result = summarization_scorer.evaluate_summary(
-        input=input_text, summary=summary_text
+
+
+@pytest.mark.asyncio
+async def test_summarization_scorer_evaluate_summary(summarization_scorer):
+    input_text = "The wolf is lonely in the forest. He is not happy that the fox is not with him."
+    summary_text = "Wolf is lonely and missing the fox."
+    result = await summarization_scorer._evaluate_summary(
+        input=input_text,
+        summary=summary_text,
     )
     assert isinstance(result, SummarizationEvaluationResponse)
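For reference, the scorer the fixture builds can also be exercised directly with the same arguments (a minimal sketch, not from the commit: it calls the same private `_evaluate_summary` helper the test uses and assumes `OPENAI_API_KEY` is set, so treat it as illustrative rather than a supported public API):

```python
import asyncio
import os

from weave.scorers import SummarizationScorer

async def main():
    assert os.getenv("OPENAI_API_KEY"), "export OPENAI_API_KEY before running"
    scorer = SummarizationScorer(
        model_id="openai/gpt-4o",  # litellm-style "provider/model" id, as in PROVIDERS above
        temperature=0.7,
        max_tokens=1024,
    )
    result = await scorer._evaluate_summary(
        input="The wolf is lonely in the forest.",
        summary="Wolf is lonely.",
    )
    print(result)

asyncio.run(main())
```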

‎tests/scorers/test_prompt_injection_llm_guardrail.py

-45
This file was deleted.
