This repository was archived by the owner on Dec 10, 2024. It is now read-only.

Add Gemini LLM option #3

Merged: 4 commits, Nov 22, 2024
13 changes: 6 additions & 7 deletions .github/workflows/tests.yml
@@ -22,16 +22,15 @@ jobs:
           python-version: |
             3.11
             3.12
+            3.13
       - name: Install the latest version of uv
         uses: astral-sh/setup-uv@v2
       - name: Install project
         run: uv sync --dev
       - name: Run tests
         run: uv run pytest --cov
-
-
-      # - name: Upload coverage reports to Codecov
-      #   uses: codecov/[email protected]
-      #   if: ${{ matrix.os == 'ubuntu-latest' }}
-      #   with:
-      #     token: ${{ secrets.CODECOV_TOKEN }}
+      - name: Upload coverage reports to Codecov
+        uses: codecov/[email protected]
+        if: ${{ matrix.os == 'ubuntu-latest' }}
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ Documentation: [ReadTheDocs](https://autoparser.readthedocs.io/en/latest)
 
 Contains functionality to:
 1. Create a basic data dictionary from a raw data file (`create-dict`)
-2. Use an LLM (currently only ChatGPT via the OpenAI API) to add descriptions to the
+2. Use an LLM (currently via either OpenAI or Google's Gemini) to add descriptions to the
    data dictionary, to enable better parser auto-generation (`add-descriptions`)
 3. Create a mapping csv file linking source to target data fields and value mappings
    using the LLM, which can be edited by a user (`create-mapping`)
9 changes: 7 additions & 2 deletions docs/examples/example.ipynb
@@ -13,8 +13,13 @@
    "source": [
     "This file demonstrates the process of constructing a parser file using `animals.csv` as a source dataset.\n",
     "\n",
-    "Before you start: `autoparser` requires an OpenAI API key to function. You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). \n",
-    "Edit the `API_KEY` line below to match the name you gave yours."
+    "Before you start: `autoparser` requires an LLM API key to function, for either OpenAI or Gemini.\n",
+    "You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).\n",
+    "This example uses the OpenAI API; edit the `API_KEY` line below to match the name you gave yours.\n",
+    "\n",
+    "If you would prefer to use Gemini, use the `llm` variable in functions where the api key is used, e.g.\n",
+    "\n",
+    "`writer.generate_descriptions(\"fr\", data_dict, key=API_KEY, llm='gemini')`"
    ]
   },
   {
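To make the notebook note above concrete, here is a minimal sketch of the two call styles side by side. The `DictWriter` construction and the data-dictionary path are assumptions for illustration; only `generate_descriptions`, `key`, and `llm` come from this diff.

```python
import os

import autoparser

# Assumed: keys live in the environment, per the linked best-practices guide.
API_KEY = os.environ["OPENAI_API_KEY"]  # or GEMINI_API_KEY for Gemini

# Hypothetical setup; see the notebook for the real construction.
writer = autoparser.DictWriter()
data_dict = "animals_dd.csv"  # placeholder data-dictionary path

# Default behaviour: OpenAI.
writer.generate_descriptions("fr", data_dict, key=API_KEY)

# New in this PR: select Gemini via the llm argument.
writer.generate_descriptions("fr", data_dict, key=API_KEY, llm="gemini")
```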
2 changes: 1 addition & 1 deletion docs/getting_started/index.md
@@ -23,7 +23,7 @@ options available.
 
 AutoParser relies on LLMs to automatically map raw data fields to a target schema.
 In order to use this tool, you will need an API key for either [OpenAI](https://platform.openai.com/docs/quickstart/create-and-export-an-api-key)
-or Google's [Gemini](https://aistudio.google.com/apikey) [Dev note: work in progress!].
+or Google's [Gemini](https://aistudio.google.com/apikey).
 AutoParser will use either OpenAI's `gpt-4-mini`, or Google's `gemini-1.5-flash`.
 
 The LLM should *never* see your raw data; only the data dictionary which contains
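As a concrete companion to the doc paragraph above, a minimal sketch of the key handling it describes (the environment-variable names are illustrative, not mandated by AutoParser):

```python
import os

# Keep keys in the environment rather than in code or notebooks.
openai_key = os.environ.get("OPENAI_API_KEY")
gemini_key = os.environ.get("GEMINI_API_KEY")

# Use whichever provider you have a key for; both are supported after this PR.
llm = "openai" if openai_key else "gemini"
api_key = openai_key or gemini_key
```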
1 change: 1 addition & 0 deletions pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "pandas>=2.2.3",
     "tomli>=2.0.2",
     "pydantic>=2.9.2",
+    "google-generativeai>=0.8.3",
 ]
 scripts = { autoparser = "autoparser:main" }
 
20 changes: 15 additions & 5 deletions src/autoparser/create_mapping.py
@@ -6,14 +6,19 @@
 from pathlib import Path
 import pandas as pd
 from openai import OpenAI
+import google.generativeai as gemini
 import warnings
 import numpy as np
 
 from .openai_calls import _map_fields as _map_fields_openai
 from .openai_calls import _map_values as _map_values_openai
+from .gemini_calls import _map_fields as _map_fields_gemini
+from .gemini_calls import _map_values as _map_values_gemini
 from .util import read_json, read_data, load_data_dict
 from .util import DEFAULT_CONFIG
 
+from typing import Literal
+
 
 class Mapper:
     """
@@ -34,7 +39,7 @@ class Mapper:
     api_key
         The API key to use for the LLM
     llm
-        The LLM to use, currently only 'openai' is supported
+        The LLM to use, currently only 'openai' and 'gemini' are supported
     config
         The path to the configuration file to use if not using the default configuration
     """
@@ -45,19 +50,24 @@ def __init__(
         data_dictionary: str | pd.DataFrame,
         language: str,
         api_key: str | None = None,
-        llm: str | None = "openai",
+        llm: Literal["openai", "gemini"] | None = "openai",
         config: Path | None = None,
     ):
         self.schema = read_json(schema)
         self.schema_properties = self.schema["properties"]
         self.language = language
         self.api_key = api_key
-        if llm is not None and llm == "openai":
+        if llm is None:
+            self.client = None
+        elif llm == "openai":
             self.client = OpenAI(api_key=self.api_key)
             self.map_fields = _map_fields_openai
             self.map_values = _map_values_openai
-        elif llm is None:
-            self.client = None
+        elif llm == "gemini":
+            gemini.configure(api_key=self.api_key)
+            self.client = gemini.GenerativeModel("gemini-1.5-flash")
+            self.map_fields = _map_fields_gemini
+            self.map_values = _map_values_gemini
         else:
             raise ValueError(f"Unsupported LLM: {llm}")
 
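A minimal sketch of driving the new constructor branches (file names and schema content are placeholders; the `Mapper` signature itself is taken from the diff above, and the import path is assumed from the module name):

```python
import os

from autoparser.create_mapping import Mapper  # assumed import path

# Gemini-backed mapper, using the branch added in this PR.
mapper = Mapper(
    schema="target_schema.json",             # placeholder JSON schema path
    data_dictionary="data_dictionary.csv",   # placeholder dictionary path
    language="fr",
    api_key=os.environ["GEMINI_API_KEY"],
    llm="gemini",
)

# llm=None now cleanly skips client creation, e.g. for offline testing.
offline = Mapper("target_schema.json", "data_dictionary.csv", "fr", llm=None)

# Any other name still fails fast:
# Mapper(..., llm="claude")  -> ValueError: Unsupported LLM: claude
```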
9 changes: 8 additions & 1 deletion src/autoparser/dict_writer.py
@@ -6,9 +6,11 @@
 from pathlib import Path
 import pandas as pd
 from openai import OpenAI
+import google.generativeai as gemini
 import numpy as np
 
 from .openai_calls import _get_definitions as _get_definitions_openai
+from .gemini_calls import _get_definitions as _get_definitions_gemini
 from .util import read_data, load_data_dict
 from .util import DEFAULT_CONFIG
 
@@ -50,14 +52,19 @@ def _setup_llm(self, key: str, name: str):
         key
             API key
         name
-            Name of the LLM to use (currently only OpenAI is supported)
+            Name of the LLM to use (currently only OpenAI and Gemini are supported)
         """
         self.key = key
         if name == "openai":
             self.client = OpenAI(api_key=key)
 
             self._get_descriptions = _get_definitions_openai
 
+        elif name == "gemini":
+            gemini.configure(api_key=key)
+            self.client = gemini.GenerativeModel("gemini-1.5-flash")
+            self._get_descriptions = _get_definitions_gemini
+
         else:
             raise ValueError(f"Unsupported LLM: {name}")
 
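A short sketch of what the updated `_setup_llm` dispatch does with each supported name (the `DictWriter` construction is hypothetical; the branch behaviour is from the diff above):

```python
import os

writer = DictWriter()  # hypothetical construction; see dict_writer.py

# "gemini" now configures the SDK and stores a GenerativeModel client.
writer._setup_llm(os.environ["GEMINI_API_KEY"], "gemini")

# "openai" keeps the existing OpenAI client path.
writer._setup_llm(os.environ["OPENAI_API_KEY"], "openai")

# Anything else still raises, e.g. ValueError: Unsupported LLM: mistral
```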
103 changes: 103 additions & 0 deletions src/autoparser/gemini_calls.py
@@ -0,0 +1,103 @@
+"Contains all functions that call Google's Gemini API."
+import google.generativeai as gemini
+from .util import ColumnDescriptionRequest, MappingRequest, ValuesRequest
+import json
+
+
+def _get_definitions(
+    headers: list[str], language: str, model: gemini.GenerativeModel
+) -> dict[str, str]:
+    """
+    Get the definitions of the columns in the dataset.
+    """
+    result = model.generate_content(
+        [
+            (
+                "You are an expert at structured data extraction. "
+                "The following is a list of headers from a data file in "
+                f"{language}, some containing shortened words or abbreviations. "
+                "Translate them to english. "
+                "Return a list of (original header, translation) pairs, using the given structure."  # noqa
+                "Preserve special characters such as accented letters and hyphens."
+            ),
+            f"{headers}",
+        ],
+        generation_config=gemini.GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=ColumnDescriptionRequest,
+        ),
+    )
+    descriptions = ColumnDescriptionRequest.model_validate(
+        json.loads(result.text)
+    ).field_descriptions
+    return descriptions
+
+
+def _map_fields(
+    source_fields: list[str], target_fields: list[str], model: gemini.GenerativeModel
+) -> MappingRequest:
+    """
+    Calls the Gemini API to generate a draft mapping between two datasets.
+    """
+    result = model.generate_content(
+        [
+            (
+                "You are an expert at structured data extraction. "
+                "You will be given two lists of phrases, one is the headers for a "
+                "target data file, and the other a set of descriptions for columns "
+                "of source data. "
+                "Match each target header to the best matching source description, "
+                "but match a header to None if a good match does not exist. "
+                "Preserve special characters such as accented letters and hyphens."
+                "Return the matched target headers and source descriptions using the provided structure."  # noqa
+            ),
+            (
+                f"These are the target headers: {target_fields}\n"
+                f"These are the source descriptions: {source_fields}"
+            ),
+        ],
+        generation_config=gemini.GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=MappingRequest,
+        ),
+    )
+    return MappingRequest.model_validate(json.loads(result.text))
+
+
+def _map_values(
+    values: list[tuple[set[str], set[str], list[str]]],
+    language: str,
+    model: gemini.GenerativeModel,
+) -> ValuesRequest:
+    """
+    Calls the Gemini API to generate a set of value mappings for the fields.
+    """
+    result = model.generate_content(
+        [
+            (
+                "You are an expert at structured data extraction. "
+                "You will be given a list of tuples, where each tuple contains "
+                "three sets of string values. "
+                "The first set contains field names for a dataset."
+                "The second set contains values from a source dataset in "
+                f"{language}, and the third set contains target values for an "
+                "english-language transformed dataset. "
+                "Match all the values in the second set to the appropriate values "
+                "in the third set. "
+                "Return a list of dictionaries, where each dictionary contains the "
+                "field name as a key, and a dictionary containing "
+                "source values as keys, and the target text as values, "
+                "as the values. For example, the result should look like this: "
+                "[{'field_name_1': {'source_value_a': 'target_value_a', "
+                "'source_value_b': 'target_value_b'}, 'field_name_2':{...}]"
+                "using the provided structure."
+                "Preserve special characters such as accented letters and hyphens."
+            ),
+            f"These are the field, source, target value sets: {values}",
+        ],
+        generation_config=gemini.GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=ValuesRequest,
+        ),
+    )
+    return ValuesRequest.model_validate(json.loads(result.text))
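For context, a hedged sketch of how `_get_definitions` would be invoked given the client setup added elsewhere in this PR (the headers and language are made-up sample data; the import path is assumed from the module name):

```python
import os

import google.generativeai as gemini

from autoparser.gemini_calls import _get_definitions  # assumed import path

# Client setup mirroring create_mapping.py / dict_writer.py above.
gemini.configure(api_key=os.environ["GEMINI_API_KEY"])
model = gemini.GenerativeModel("gemini-1.5-flash")

# Illustrative French headers containing abbreviations.
headers = ["nom", "âge", "date_naiss"]
descriptions = _get_definitions(headers, "french", model)
# -> header/translation pairs validated through ColumnDescriptionRequest
```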
2 changes: 1 addition & 1 deletion src/autoparser/util.py
@@ -91,7 +91,7 @@ def load_data_dict(
 
 class SingleField(BaseModel):
     field_name: str
-    translation: str
+    translation: str | None
 
 
 class ColumnDescriptionRequest(BaseModel):
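The widened `translation` type matters, plausibly because the new Gemini prompts allow a header to match None when no good translation exists (see gemini_calls.py above). A minimal standalone sketch of the behaviour change, with the model shape copied from the diff:

```python
from pydantic import BaseModel


class SingleField(BaseModel):
    field_name: str
    translation: str | None  # None when the LLM reports no good match


# Under the old `translation: str`, this raised a ValidationError;
# it now validates cleanly.
unmatched = SingleField(field_name="identifiant", translation=None)
print(unmatched)  # field_name='identifiant' translation=None
```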