This repository was archived by the owner on Dec 10, 2024. It is now read-only.

Add Gemini LLM option #3

Merged: 4 commits, Nov 22, 2024
13 changes: 6 additions & 7 deletions .github/workflows/tests.yml
@@ -22,16 +22,15 @@ jobs:
           python-version: |
             3.11
             3.12
+            3.13
       - name: Install the latest version of uv
         uses: astral-sh/setup-uv@v2
       - name: Install project
         run: uv sync --dev
       - name: Run tests
         run: uv run pytest --cov
-
-
-      # - name: Upload coverage reports to Codecov
-      #   uses: codecov/[email protected]
-      #   if: ${{ matrix.os == 'ubuntu-latest' }}
-      #   with:
-      #     token: ${{ secrets.CODECOV_TOKEN }}
+      - name: Upload coverage reports to Codecov
+        uses: codecov/[email protected]
+        if: ${{ matrix.os == 'ubuntu-latest' }}
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ Documentation: [ReadTheDocs](https://autoparser.readthedocs.io/en/latest)
 
 Contains functionality to:
 1. Create a basic data dictionary from a raw data file (`create-dict`)
-2. Use an LLM (currently only ChatGPT via the OpenAI API) to add descriptions to the
+2. Use an LLM (currently via either OpenAI or Google's Gemini) to add descriptions to the
    data dictionary, to enable better parser auto-generation (`add-descriptions`)
 3. Create a mapping csv file linking source to target data fields and value mappings
    using the LLM, which can be edited by a user (`create-mapping`)
9 changes: 7 additions & 2 deletions docs/examples/example.ipynb
@@ -13,8 +13,13 @@
    "source": [
     "This file demonstrates the process of constructing a parser file using `animals.csv` as a source dataset.\n",
     "\n",
-    "Before you start: `autoparser` requires an OpenAI API key to function. You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). \n",
-    "Edit the `API_KEY` line below to match the name you gave yours."
+    "Before you start: `autoparser` requires an LLM API key to function, for either OpenAI or Gemini.\n",
+    "You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).\n",
+    "This example uses the OpenAI API; edit the `API_KEY` line below to match the name you gave yours.\n",
+    "\n",
+    "If you would prefer to use Gemini, use the `llm` variable in functions where the api key is used, e.g.\n",
+    "\n",
+    "`writer.generate_descriptions(\"fr\", data_dict, key=API_KEY, llm='gemini')`"
    ]
   },
   {
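To make the notebook note above concrete, here is a minimal sketch of the two call styles side by side. The `DictWriter` construction and the data-dictionary path are assumptions for illustration; only `generate_descriptions`, `key`, and `llm` come from this diff.

```python
import os

import autoparser

# Assumed: keys live in the environment, per the linked best-practices guide.
API_KEY = os.environ["OPENAI_API_KEY"]  # or GEMINI_API_KEY for Gemini

# Hypothetical setup; see the notebook for the real construction.
writer = autoparser.DictWriter()
data_dict = "animals_dd.csv"  # placeholder data-dictionary path

# Default behaviour: OpenAI.
writer.generate_descriptions("fr", data_dict, key=API_KEY)

# New in this PR: select Gemini via the llm argument.
writer.generate_descriptions("fr", data_dict, key=API_KEY, llm="gemini")
```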
2 changes: 1 addition & 1 deletion docs/getting_started/index.md
@@ -23,7 +23,7 @@ options available.
 
 AutoParser relies on LLMs to automatically map raw data fields to a target schema.
 In order to use this tool, you will need an API key for either [OpenAI](https://platform.openai.com/docs/quickstart/create-and-export-an-api-key)
-or Google's [Gemini](https://aistudio.google.com/apikey) [Dev note: work in progress!].
+or Google's [Gemini](https://aistudio.google.com/apikey).
 AutoParser will use either OpenAI's `gpt-4-mini`, or Google's `gemini-1.5-flash`.
 
 The LLM should *never* see your raw data; only the data dictionary which contains
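As a concrete companion to the doc paragraph above, a minimal sketch of the key handling it describes (the environment-variable names are illustrative, not mandated by AutoParser):

```python
import os

# Keep keys in the environment rather than in code or notebooks.
openai_key = os.environ.get("OPENAI_API_KEY")
gemini_key = os.environ.get("GEMINI_API_KEY")

# Use whichever provider you have a key for; both are supported after this PR.
llm = "openai" if openai_key else "gemini"
api_key = openai_key or gemini_key
```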
1 change: 1 addition & 0 deletions pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "pandas>=2.2.3",
     "tomli>=2.0.2",
     "pydantic>=2.9.2",
+    "google-generativeai>=0.8.3",
 ]
 scripts = { autoparser = "autoparser:main" }
 
20 changes: 15 additions & 5 deletions src/autoparser/create_mapping.py
@@ -6,14 +6,19 @@
 from pathlib import Path
 import pandas as pd
 from openai import OpenAI
+import google.generativeai as gemini
 import warnings
 import numpy as np
 
 from .openai_calls import _map_fields as _map_fields_openai
 from .openai_calls import _map_values as _map_values_openai
+from .gemini_calls import _map_fields as _map_fields_gemini
+from .gemini_calls import _map_values as _map_values_gemini
 from .util import read_json, read_data, load_data_dict
 from .util import DEFAULT_CONFIG
 
+from typing import Literal
+
 
 class Mapper:
     """
@@ -34,7 +39,7 @@ class Mapper:
     api_key
         The API key to use for the LLM
     llm
-        The LLM to use, currently only 'openai' is supported
+        The LLM to use, currently only 'openai' and 'gemini' are supported
     config
         The path to the configuration file to use if not using the default configuration
     """
@@ -45,19 +50,24 @@ def __init__(
         data_dictionary: str | pd.DataFrame,
         language: str,
         api_key: str | None = None,
-        llm: str | None = "openai",
+        llm: Literal["openai", "gemini"] | None = "openai",
         config: Path | None = None,
     ):
         self.schema = read_json(schema)
         self.schema_properties = self.schema["properties"]
         self.language = language
         self.api_key = api_key
-        if llm is not None and llm == "openai":
+        if llm is None:
+            self.client = None
+        elif llm == "openai":
             self.client = OpenAI(api_key=self.api_key)
             self.map_fields = _map_fields_openai
             self.map_values = _map_values_openai
-        elif llm is None:
-            self.client = None
+        elif llm == "gemini":
+            gemini.configure(api_key=self.api_key)
+            self.client = gemini.GenerativeModel("gemini-1.5-flash")
+            self.map_fields = _map_fields_gemini
+            self.map_values = _map_values_gemini
         else:
             raise ValueError(f"Unsupported LLM: {llm}")
 
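A minimal sketch of driving the new constructor branches (file names and schema content are placeholders; the `Mapper` signature itself is taken from the diff above, and the import path is assumed from the module name):

```python
import os

from autoparser.create_mapping import Mapper  # assumed import path

# Gemini-backed mapper, using the branch added in this PR.
mapper = Mapper(
    schema="target_schema.json",             # placeholder JSON schema path
    data_dictionary="data_dictionary.csv",   # placeholder dictionary path
    language="fr",
    api_key=os.environ["GEMINI_API_KEY"],
    llm="gemini",
)

# llm=None now cleanly skips client creation, e.g. for offline testing.
offline = Mapper("target_schema.json", "data_dictionary.csv", "fr", llm=None)

# Any other name still fails fast:
# Mapper(..., llm="claude")  -> ValueError: Unsupported LLM: claude
```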
9 changes: 8 additions & 1 deletion src/autoparser/dict_writer.py
@@ -6,9 +6,11 @@
 from pathlib import Path
 import pandas as pd
 from openai import OpenAI
+import google.generativeai as gemini
 import numpy as np
 
 from .openai_calls import _get_definitions as _get_definitions_openai
+from .gemini_calls import _get_definitions as _get_definitions_gemini
 from .util import read_data, load_data_dict
 from .util import DEFAULT_CONFIG
 
@@ -50,14 +52,19 @@ def _setup_llm(self, key: str, name: str):
         key
             API key
         name
-            Name of the LLM to use (currently only OpenAI is supported)
+            Name of the LLM to use (currently only OpenAI and Gemini are supported)
         """
         self.key = key
         if name == "openai":
             self.client = OpenAI(api_key=key)
 
             self._get_descriptions = _get_definitions_openai
 
+        elif name == "gemini":
+            gemini.configure(api_key=key)
+            self.client = gemini.GenerativeModel("gemini-1.5-flash")
+            self._get_descriptions = _get_definitions_gemini
+
         else:
             raise ValueError(f"Unsupported LLM: {name}")
 
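A short sketch of what the updated `_setup_llm` dispatch does with each supported name (the `DictWriter` construction is hypothetical; the branch behaviour is from the diff above):

```python
import os

writer = DictWriter()  # hypothetical construction; see dict_writer.py

# "gemini" now configures the SDK and stores a GenerativeModel client.
writer._setup_llm(os.environ["GEMINI_API_KEY"], "gemini")

# "openai" keeps the existing OpenAI client path.
writer._setup_llm(os.environ["OPENAI_API_KEY"], "openai")

# Anything else still raises, e.g. ValueError: Unsupported LLM: mistral
```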
103 changes: 103 additions & 0 deletions src/autoparser/gemini_calls.py
@@ -0,0 +1,103 @@
+"Contains all functions that call Google's Gemini API."
+import google.generativeai as gemini
+from .util import ColumnDescriptionRequest, MappingRequest, ValuesRequest
+import json
+
+
+def _get_definitions(
+    headers: list[str], language: str, model: gemini.GenerativeModel
+) -> dict[str, str]:
+    """
+    Get the definitions of the columns in the dataset.
+    """
+    result = model.generate_content(
+        [
+            (
+                "You are an expert at structured data extraction. "
+                "The following is a list of headers from a data file in "
+                f"{language}, some containing shortened words or abbreviations. "
+                "Translate them to english. "
+                "Return a list of (original header, translation) pairs, using the given structure."  # noqa
+                "Preserve special characters such as accented letters and hyphens."
+            ),
+            f"{headers}",
+        ],
+        generation_config=gemini.GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=ColumnDescriptionRequest,
+        ),
+    )
+    descriptions = ColumnDescriptionRequest.model_validate(
+        json.loads(result.text)
+    ).field_descriptions
+    return descriptions
+
+
+def _map_fields(
+    source_fields: list[str], target_fields: list[str], model: gemini.GenerativeModel
+) -> MappingRequest:
+    """
+    Calls the Gemini API to generate a draft mapping between two datasets.
+    """
+    result = model.generate_content(
+        [
+            (
+                "You are an expert at structured data extraction. "
+                "You will be given two lists of phrases, one is the headers for a "
+                "target data file, and the other a set of descriptions for columns "
+                "of source data. "
+                "Match each target header to the best matching source description, "
+                "but match a header to None if a good match does not exist. "
+                "Preserve special characters such as accented letters and hyphens."
+                "Return the matched target headers and source descriptions using the provided structure."  # noqa
+            ),
+            (
+                f"These are the target headers: {target_fields}\n"
+                f"These are the source descriptions: {source_fields}"
+            ),
+        ],
+        generation_config=gemini.GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=MappingRequest,
+        ),
+    )
+    return MappingRequest.model_validate(json.loads(result.text))
+
+
+def _map_values(
+    values: list[tuple[set[str], set[str], list[str]]],
+    language: str,
+    model: gemini.GenerativeModel,
+) -> ValuesRequest:
+    """
+    Calls the Gemini API to generate a set of value mappings for the fields.
+    """
+    result = model.generate_content(
+        [
+            (
+                "You are an expert at structured data extraction. "
+                "You will be given a list of tuples, where each tuple contains "
+                "three sets of string values. "
+                "The first set contains field names for a dataset."
+                "The second set contains values from a source dataset in "
+                f"{language}, and the third set contains target values for an "
+                "english-language transformed dataset. "
+                "Match all the values in the second set to the appropriate values "
+                "in the third set. "
+                "Return a list of dictionaries, where each dictionary contains the "
+                "field name as a key, and a dictionary containing "
+                "source values as keys, and the target text as values, "
+                "as the values. For example, the result should look like this: "
+                "[{'field_name_1': {'source_value_a': 'target_value_a', "
+                "'source_value_b': 'target_value_b'}, 'field_name_2':{...}]"
+                "using the provided structure."
+                "Preserve special characters such as accented letters and hyphens."
+            ),
+            f"These are the field, source, target value sets: {values}",
+        ],
+        generation_config=gemini.GenerationConfig(
+            response_mime_type="application/json",
+            response_schema=ValuesRequest,
+        ),
+    )
+    return ValuesRequest.model_validate(json.loads(result.text))
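For context, a hedged sketch of how `_get_definitions` would be invoked given the client setup added elsewhere in this PR (the headers and language are made-up sample data; the import path is assumed from the module name):

```python
import os

import google.generativeai as gemini

from autoparser.gemini_calls import _get_definitions  # assumed import path

# Client setup mirroring create_mapping.py / dict_writer.py above.
gemini.configure(api_key=os.environ["GEMINI_API_KEY"])
model = gemini.GenerativeModel("gemini-1.5-flash")

# Illustrative French headers containing abbreviations.
headers = ["nom", "âge", "date_naiss"]
descriptions = _get_definitions(headers, "french", model)
# -> header/translation pairs validated through ColumnDescriptionRequest
```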
2 changes: 1 addition & 1 deletion src/autoparser/util.py
@@ -91,7 +91,7 @@ def load_data_dict(
 
 class SingleField(BaseModel):
     field_name: str
-    translation: str
+    translation: str | None
 
 
 class ColumnDescriptionRequest(BaseModel):
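The widened `translation` type matters, plausibly because the new Gemini prompts allow a header to match None when no good translation exists (see gemini_calls.py above). A minimal standalone sketch of the behaviour change, with the model shape copied from the diff:

```python
from pydantic import BaseModel


class SingleField(BaseModel):
    field_name: str
    translation: str | None  # None when the LLM reports no good match


# Under the old `translation: str`, this raised a ValidationError;
# it now validates cleanly.
unmatched = SingleField(field_name="identifiant", translation=None)
print(unmatched)  # field_name='identifiant' translation=None
```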