Merge branch 'master' into feat/presidio-entity-guardrail
soumik12345 authored Feb 7, 2025
2 parents 2c3db54 + c6dcf6c commit 2240d4d
Showing 35 changed files with 550 additions and 593 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test.yaml
@@ -251,7 +251,7 @@ jobs:
            "notdiamond",
            "openai",
            "vertexai",
-           "scorers_tests",
+           "scorers",
            "pandas-test",
          ]
       fail-fast: false
@@ -319,6 +319,7 @@ jobs:
       WEAVE_SERVER_DISABLE_ECOSYSTEM: 1
       WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
       GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
2 changes: 1 addition & 1 deletion docs/docs/quickstart.md
@@ -131,7 +131,7 @@ _In this example, we're using openai so you will need to add an OpenAI [API key]
 
 ## 3. Automated LLM library logging
 
-Calls made to OpenAI, Anthropic and [many more LLM libraries](guides/integrations/) are automatically tracked with Weave, with **LLM metadata**, **token usage** and **cost** being logged automatically. If your LLM library isn't currently one of our integrations you can track calls to other LLMs libraries or frameworks easily by wrapping them with `@weave.op()`.
+Calls made to OpenAI, Anthropic and [many more LLM libraries](./guides/integrations/index.md) are automatically tracked with Weave, with **LLM metadata**, **token usage** and **cost** being logged automatically. If your LLM library isn't currently one of our integrations you can track calls to other LLMs libraries or frameworks easily by wrapping them with `@weave.op()`.
 
 ## 4. See traces of your application in your project
 
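For reference, the `@weave.op()` pattern mentioned in the quickstart paragraph above looks roughly like the following; this is a minimal sketch, with the project name and the wrapped function being hypothetical:

import weave

weave.init("my-project")  # hypothetical project name

# Wrapping any function in @weave.op() makes Weave record its inputs,
# outputs, and latency, even for LLM libraries without a built-in integration.
@weave.op()
def call_my_llm(prompt: str) -> str:
    # stand-in for a call to an LLM client that Weave does not auto-patch
    return f"echo: {prompt}"

call_my_llm("Hello, Weave!")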
9 changes: 5 additions & 4 deletions noxfile.py
@@ -13,7 +13,7 @@
     "notdiamond",
     "google_ai_studio",
     "bedrock",
-    "scorers_tests",
+    "scorers",
 ]
 
 
@@ -52,7 +52,7 @@ def lint(session):
         "openai",
         "vertexai",
         "bedrock",
-        "scorers_tests",
+        "scorers",
         "pandas-test",
     ],
 )
@@ -83,8 +83,9 @@ def tests(session, shard):
 
     # we are doing some integration test in test_llm_integrations.py that requires
     # setting some environment variables for the LLM providers
-    if shard == "scorers_tests":
+    if shard == "scorers":
         env["GOOGLE_API_KEY"] = session.env.get("GOOGLE_API_KEY")
+        env["GEMINI_API_KEY"] = session.env.get("GEMINI_API_KEY")
         env["ANTHROPIC_API_KEY"] = session.env.get("ANTHROPIC_API_KEY")
         env["MISTRAL_API_KEY"] = session.env.get("MISTRAL_API_KEY")
         env["OPENAI_API_KEY"] = session.env.get("OPENAI_API_KEY")
@@ -96,7 +97,7 @@ def tests(session, shard):
         "trace_server": ["trace_server/"],
         "mistral0": ["integrations/mistral/v0/"],
         "mistral1": ["integrations/mistral/v1/"],
-        "scorers_tests": ["scorers/"],
+        "scorers": ["scorers/"],
     }
 
     test_dirs = test_dirs_dict.get(shard, default_test_dirs)
10 changes: 2 additions & 8 deletions pyproject.toml
@@ -75,15 +75,9 @@ litellm = ["litellm>=1.36.1"]
 llamaindex = ["llama-index>=0.10.35"]
 mistral0 = ["mistralai>=0.1.8,<1.0.0"]
 mistral1 = ["mistralai>=1.0.0"]
-scorers = ["Levenshtein>=0.26.0", "instructor>=1.5.2"]
-scorers_tests = [
-    "instructor>=1.5.2",
+scorers = [
     "Levenshtein>=0.26.0",
-    "openai>=1.0.0",
-    "google-generativeai>=0.8.0",
-    "mistralai>=1.0.3",
-    "anthropic>=0.30.0",
-    "litellm>=1.58.2",
+    "litellm>=1.58",
     "presidio>=0.1.0",
     "presidio-analyzer>=2.2.357",
     "presidio-anonymizer>=2.2.357",
52 changes: 31 additions & 21 deletions tests/scorers/test_hallucination_scorer.py
@@ -1,49 +1,59 @@
+import json
+
 import pytest
-from openai import OpenAI
+from pydantic import BaseModel
 
 import weave
 from weave.scorers import (
     HallucinationFreeScorer,
 )
 from weave.scorers.hallucination_scorer import (
-    HallucinationReasoning,
     HallucinationResponse,
 )
 
 
-# mock the create function
 @pytest.fixture
-def mock_create(monkeypatch):
-    def _mock_create(*args, **kwargs):
-        return HallucinationResponse(
-            chain_of_thought="The output is consistent with the input data.",
-            reasonings=[
-                HallucinationReasoning(
-                    observation="My observation for this is that the output is consistent with the input data.",
-                    hallucination_type="No Hallucination",
-                )
+def hallucination_scorer(monkeypatch):
+    async def _mock_acompletion(*args, **kwargs):
+        content = {
+            "chain_of_thought": "The output is consistent with the input data.",
+            "reasonings": [
+                {
+                    "hallucination_type": "No Hallucination",
+                    "observation": "My observation for this is that the output is consistent with the input data.",
+                }
             ],
-            conclusion="The output is consistent with the input data.",
-            has_hallucination=True,
-        )
+            "conclusion": "The output is consistent with the input data.",
+            "has_hallucination": True,
+        }
 
-    monkeypatch.setattr("weave.scorers.hallucination_scorer.create", _mock_create)
+        class Message(BaseModel):
+            content: str
+
+        class Choice(BaseModel):
+            message: Message
+
+        class Response(BaseModel):
+            choices: list[Choice]
+
+        return Response(choices=[Choice(message=Message(content=json.dumps(content)))])
 
+    monkeypatch.setattr(
+        "weave.scorers.hallucination_scorer.acompletion", _mock_acompletion
+    )
 
-@pytest.fixture
-def hallucination_scorer(mock_create):
     return HallucinationFreeScorer(
-        client=OpenAI(api_key="DUMMY_API_KEY"),
         model_id="gpt-4o",
         temperature=0.7,
         max_tokens=4096,
     )
 
 
-def test_hallucination_scorer_score(hallucination_scorer, mock_create):
+@pytest.mark.asyncio
+async def test_hallucination_scorer_score(hallucination_scorer):
     output = "John's favorite cheese is cheddar."
     context = "John likes various types of cheese."
-    result = hallucination_scorer.score(output=output, context=context)
+    result = await hallucination_scorer.score(output=output, context=context)
     # we should be able to do this validation
     _ = HallucinationResponse.model_validate(result)
 
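The new fixture above replaces the instructor-style `create` mock with a mock of litellm's `acompletion`, returning an object shaped like a chat-completion response whose message content is a JSON string. A rough sketch of how such a response would typically be unpacked on the scorer side is shown below; the helper name and the exact parsing logic inside `HallucinationFreeScorer` are assumptions, not taken from this diff:

import json

from weave.scorers.hallucination_scorer import HallucinationResponse

def parse_completion(response) -> HallucinationResponse:
    # The mocked acompletion returns choices[0].message.content as a JSON string,
    # so validating it against the response model mirrors what the test asserts.
    payload = json.loads(response.choices[0].message.content)
    return HallucinationResponse.model_validate(payload)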
111 changes: 46 additions & 65 deletions tests/scorers/test_llm_integrations.py
@@ -7,76 +7,57 @@
     SummarizationScorer,
 )
 
-# Define providers and their models
-TEST_MODELS = {
-    "openai": ["gpt-4o-mini", "gpt-4o"],
-    "anthropic": ["claude-3-haiku-20240307", "claude-3-5-sonnet-20240620"],
-    "mistral": ["mistral-small-latest", "mistral-large-latest"],
-    "gemini": ["gemini-1.5-flash", "gemini-1.5-pro-latest"],
+# Centralized dictionary for providers with their associated models and API key environment variable.
+PROVIDERS = {
+    "openai": {
+        "models": ["openai/gpt-4o"],
+        "env_key": "OPENAI_API_KEY",
+    },
+    "anthropic": {
+        "models": ["anthropic/claude-3-5-sonnet-20240620"],
+        "env_key": "ANTHROPIC_API_KEY",
+    },
+    "mistral": {
+        "models": ["mistral/mistral-large-latest"],
+        "env_key": "MISTRAL_API_KEY",
+    },
+    "gemini": {
+        "models": ["gemini/gemini-2.0-flash"],
+        "env_key": "GOOGLE_API_KEY",
+    },
 }
 
 
-def get_client_and_model(provider, model):
-    api_key_env_vars = {
-        "openai": "OPENAI_API_KEY",
-        "anthropic": "ANTHROPIC_API_KEY",
-        "mistral": "MISTRAL_API_KEY",
-        "gemini": "GOOGLE_API_KEY",
-    }
-
-    if provider not in TEST_MODELS:
-        raise ValueError(f"Unknown provider: {provider}")
-
-    if model not in TEST_MODELS[provider]:
-        raise ValueError(f"Model '{model}' not available for provider '{provider}'")
-
-    api_key = os.getenv(api_key_env_vars[provider])
-    if not api_key:
-        raise OSError(
-            f"API key for {provider} not found. Please set '{api_key_env_vars[provider]}' environment variable."
-        )
-
-    if provider == "openai":
-        from openai import OpenAI
-
-        client = OpenAI(api_key=api_key)
-    elif provider == "anthropic":
-        from anthropic import Anthropic
-
-        client = Anthropic(api_key=api_key)
-    elif provider == "mistral":
-        from mistralai import Mistral
-
-        client = Mistral(api_key=api_key)
-    elif provider == "gemini":
-        import google.generativeai as genai
-
-        genai.configure(api_key=api_key)
-        client = genai.GenerativeModel(model_name=model)
-        model = "gemini"  # Adjust if necessary
-
-    return client, model
-
-
-# Generate test parameters
-test_params = [
-    (provider, model) for provider, models in TEST_MODELS.items() for model in models
-]
-
-
-@pytest.mark.parametrize("provider,model", test_params, ids=lambda p: f"{p[0]}:{p[1]}")
-def test_summarization_scorer_evaluate_summary(provider, model):
-    client, model_id = get_client_and_model(provider, model)
-
-    summarization_scorer = SummarizationScorer(
-        client=client,
-        model_id=model_id,
+@pytest.fixture(
+    params=[
+        (provider, model)
+        for provider, cfg in PROVIDERS.items()
+        for model in cfg["models"]
+    ],
+    ids=lambda p: f"{p[0]}:{p[1]}",
+)
+def summarization_scorer(request):
+    """
+    Fixture that returns an instance of SummarizationScorer.
+    It checks if the required API key is available, and if not, skips the test.
+    """
+    provider, model = request.param
+    env_key = PROVIDERS[provider]["env_key"]
+    if not os.getenv(env_key):
+        pytest.skip(f"API key for {provider} not found. Skipping test.")
+    return SummarizationScorer(
+        model_id=model,
         temperature=0.7,
         max_tokens=1024,
     )
-    input_text = "This is the original text."
-    summary_text = "This is the summary."
-    result = summarization_scorer.evaluate_summary(
-        input=input_text, summary=summary_text
+
+
+@pytest.mark.asyncio
+async def test_summarization_scorer_evaluate_summary(summarization_scorer):
+    input_text = "The wolf is lonely in the forest. He is not happy that the fox is not with him."
+    summary_text = "Wolf is lonely and missing the fox."
+    result = await summarization_scorer._evaluate_summary(
+        input=input_text,
+        summary=summary_text,
    )
     assert isinstance(result, SummarizationEvaluationResponse)
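The provider-prefixed model strings in `PROVIDERS` (for example `openai/gpt-4o` or `gemini/gemini-2.0-flash`) follow litellm's model-routing convention, which is presumably what lets the scorers drop the per-provider client objects used before. A minimal sketch of a direct litellm call with such a model string, assuming the matching API key is set in the environment (the prompt text is made up):

import asyncio

from litellm import acompletion

async def main():
    # litellm routes the request to the provider named by the model prefix,
    # e.g. "openai/gpt-4o" is sent to OpenAI using OPENAI_API_KEY.
    response = await acompletion(
        model="openai/gpt-4o",
        messages=[{"role": "user", "content": "Summarize: the wolf misses the fox."}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())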
45 changes: 0 additions & 45 deletions tests/scorers/test_prompt_injection_llm_guardrail.py

This file was deleted.
