Type annotation updates (#2)
* Update dev.dependencies section for Poetry.

* Spelling in comment.

* Remove empty line.

* Documentation update.

* Documentation updates.

* Documentation update. Simplify chat_output signature. Add (doc)test.

* Documentation update.

* Add mypy to Nox.

* Return 0 instead of None.

* Add Ruff rules.

* Fix from logging.warn to logging.warning. Also suspend pre-commit Ruff.

* Change variable name to lower case.

* Remove COM Ruff rule.

* Optimize return value.

* Ignore Ruff logging format errors.

* Ignore lowercase for arguments, function in test cases.

* Ruff fixes.

* Ruff fixes.

* Use list comprehension for dict_subs.

* Re-enable Ruff on pre-commit. Ignore BLE Ruff errors.

* Add comment.

* Typing fix. Mypy setting in toml.

* Typing clarification.

* Correct type annotations per mypy feedback.

* Type annotation corrections as per mypy (non-exhaustive).

* Ignore type checking for mdutils (as no annotations exist).

* Add mypy.

* Version update.
EJOOSTEROP authored Aug 31, 2023
1 parent a66214f commit e4b6985
Showing 8 changed files with 217 additions and 91 deletions.
23 changes: 19 additions & 4 deletions .devcontainer/devcontainer.json
@@ -10,10 +10,25 @@

// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
"ghcr.io/devcontainers/features/common-utils:2": {},
"ghcr.io/devcontainers-contrib/features/poetry:2": {},
"ghcr.io/devcontainers-contrib/features/nox:2": {},
"ghcr.io/devcontainers-contrib/features/pre-commit:2": {}
"ghcr.io/devcontainers/features/common-utils:2": {
"installZsh": true,
"installOhMyZsh": true,
"installOhMyZshConfig": true,
"upgradePackages": true,
"username": "devcontainer"
},
"ghcr.io/devcontainers-contrib/features/poetry:2": {
"version": "latest"
},
"ghcr.io/devcontainers-contrib/features/nox:2": {
"version": "latest"
},
"ghcr.io/devcontainers-contrib/features/pre-commit:2": {
"version": "latest"
},
"ghcr.io/devcontainers-contrib/features/mypy:2": {
"version": "latest"
}
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
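Note, as a usage sketch: changes to the feature list in devcontainer.json take effect only after the dev container is rebuilt, for example via the VS Code command palette:

    Dev Containers: Rebuild Container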
18 changes: 13 additions & 5 deletions noxfile.py
@@ -40,10 +40,18 @@ def ruff(session):
@nox.session
def test(session):
# Not certain this is a good approach. But it currently works.
# session.install("pytest")
# session.install("pytest-cov")

session.run("pytest", "--cov=quke", "tests/")

# test_files = session.posargs if session.posargs else []
# session.run("pytest", "--cov=quke", *test_files)
# TODO: test_files = session.posargs if session.posargs else []
# TODO: session.run("pytest", "--cov=quke", *test_files)


@nox.session
def mypy(session):
session.install("mypy")
session.run(
"mypy",
"./quke",
"--python-executable",
"/home/vscode/.cache/pypoetry/virtualenvs/quke-61FoJWY3-py3.11/bin/python",
)
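As a usage sketch, assuming Nox is installed and the hard-coded interpreter path above matches the local Poetry virtualenv:

    nox -s mypy    # run only the new mypy session
    nox            # run every session defined in noxfile.py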
2 changes: 1 addition & 1 deletion poetry.lock


45 changes: 34 additions & 11 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "quke"
version = "0.2.0"
version = "0.2.1"
description = "Compare the answering capabilities of different LLMs - for example LlaMa, ChatGPT, Cohere, Falcon - against user provided document(s) and questions."
authors = ["Erik Oosterop"]
maintainers = ["Erik Oosterop"]
@@ -36,14 +36,10 @@ replicate = "^0.9.0"
mdutils = "^1.6.0"
rich = "^13.5.2"


[tool.poetry.dev-dependencies]
pytest = "^7.4.0"
requests-mock = "^1.11.0"


[tool.poetry.group.dev.dependencies]
pytest-cov = "^4.1.0"
pytest = "^7.4.0"
requests-mock = "^1.11.0"

[build-system]
requires = ["poetry-core"]
@@ -61,22 +57,45 @@ line-length = 119
select = [ # https://beta.ruff.rs/docs/rules/
"A", # prevent using keywords that clobber python builtins
"ANN", # type annotation
"ARG", # unused arguments
"B", # bugbear: security warnings
# "BLE", # blind exceptions
"C",
"C90",
# "COM", # commas
"C4", # comprehension
"C90", # McCabe complexity
"D", # pydocstyle
# "DAR", # darglint, but does not seem to be implemented at the moment
"DTZ", # date timezone
"E", # pycodestyle
"EM", # error messages
"ERA", # eradicate
"EXE", # executables
"F", # pyflakes
"FLY", # f-strings
# "G", # logging format (no f-string)
"I", # isort
"ICN", # import conventions
"INT", # gettext
"ISC", # implicit string concatenation
"N", # pep8 naming
"PERF", # performance lint
"PIE", #
"PT", # pytest style
"PTH", # use pathlib
"Q",
"Q", # quotes
"RET", # return values
"RSE", # error parenthesis
"RUF", # ruff rules
"S", # Bandit
"SIM", # simplify
"TCH", # type checking
# "TD", # TODO
"TID", # tidy imports
"TRY", # tryceratops
"T20", # print statement
"UP", # alert you when better syntax is available in your python version
"W", # pycodestyle warnings
"RUF", # the ruff developer's own rules
]
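As a usage sketch, assuming a recent Ruff CLI, the selection above applies when linting and the fixable set (further below) when auto-fixing:

    ruff check .          # lint using the 'select' rule sets
    ruff check --fix .    # additionally apply the rules listed under 'fixable'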

@@ -98,7 +117,7 @@ fixable = [

[tool.ruff.per-file-ignores]
"tests/**/*.py" = [
# at least this three should be fine in tests:
# at least these three should be fine in tests:
"S101", # asserts allowed in tests...
"ANN", # TODO: do not care about type annotations in tests for now
"ARG", # Unused function args -> fixtures nevertheless are functionally relevant...
@@ -107,6 +126,7 @@ fixable = [
"PLR2004", # Magic value used in comparison, ...
"S311", # Standard pseudo-random generators are not suitable for cryptographic purposes
"D", # no pydocstyle
"N", # Argument, function to lowercase
]

"noxfile.py" = [
@@ -115,4 +135,7 @@ fixable = [
]

[tool.ruff.pydocstyle]
convention = "google"
convention = "google"

[tool.mypy]
disallow_incomplete_defs = true
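A small sketch of what this setting catches, assuming otherwise-default mypy settings: disallow_incomplete_defs rejects functions that are only partially annotated, while fully annotated functions (and, under this particular check, fully unannotated ones) pass.

    def scale(value: float, factor) -> float:  # mypy error: incomplete annotations
        return value * factor

    def scale_ok(value: float, factor: float) -> float:  # fully annotated: passes
        return value * factor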
73 changes: 61 additions & 12 deletions quke/embed.py
@@ -7,6 +7,7 @@
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterator

# [ ] TODO: PyMU is faster, PyPDF more accurate: https://github.com/py-pdf/benchmarks
from langchain.document_loaders import CSVLoader, PyMuPDFLoader, TextLoader
@@ -20,7 +21,10 @@ class DocumentLoaderDef:

ext: str = "pdf"
loader: object = PyMuPDFLoader
kwargs: defaultdict[dict] = field(default_factory=dict) # empty dict
# TODO: Remove this - kwargs: defaultdict[dict] = field(default_factory=dict) # empty dict
kwargs: dict[str, str] = field(
default_factory=lambda: defaultdict(dict)
) # empty dict


DOC_LOADERS = [
@@ -41,6 +45,9 @@ def get_loaders(src_doc_folder: str, loader: DocumentLoaderDef) -> list:
src_doc_folder: The folder of the source files.
loader: Definition of the loader. Loaders exist for example for
pdf, text and csv files.
Returns:
A list of loaders to be used to read the text from source documents.
"""
ext = loader.ext

@@ -51,15 +58,20 @@ def get_loaders(src_doc_folder: str, loader: DocumentLoaderDef) -> list:

# TODO: Problem with embedding more than 2 files at once, or some number of pages/chunks (using HF)?
# Error message does not really help. Appending in steps does work.
loaders = [
return [
loader.loader(str(pdf_name), **loader.kwargs) for pdf_name in src_file_names
]

return loaders


def get_pages_from_document(src_doc_folder: str) -> list:
"""Reads documents from the directory/folder provided and returns a list of pages and metadata."""
"""Reads documents from the directory/folder provided and returns a list of pages and metadata.
Args:
src_doc_folder: Folder containing the source documents.
Returns:
List containing one page per list item, as text.
"""
pages = []
for docloader in DOC_LOADERS:
for loader in get_loaders(src_doc_folder, docloader):
@@ -79,9 +91,20 @@ def get_pages_from_document(src_doc_folder: str) -> list:


def get_chunks_from_pages(pages: list, splitter_params: dict) -> list:
"""Splits pages into smaller chunks used for embedding."""
# for splitter args containing 'func', the yaml value is converted into a Python function.
# TODO: Security risk? Hence a safe_list of functions is provided; severly limiting flexibility.
"""Splits pages into smaller chunks used for embedding.
Args:
pages: List with page text of a document(s).
splitter_params: Dictionary with settings for splitting logic, having
keys splitter_args and splitter_import.
splitter_args are provided to the splitter function as **kwargs. Note that if a keyword
contains 'func' the value will be evaluated as a python function (only 'len' allowed).
Returns:
A list of smaller text chunks from the pages. In a next step to be used for embedding.
"""
# TODO: eval() is a security risk. Hence a safe_list of functions is provided; severely
# limiting risk and flexibility.
# TODO: The other limiting factor: any parameter containing 'func' is eval()-ed into a function reference;
# also no other parameter is.
safe_function_list = ["len"]
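# For illustration only (not part of this commit): the splitter_params shape
# described in the docstring above might look as follows; the exact key names
# come from the project's YAML config, so treat them as assumptions.
# splitter_params = {
#     "splitter_import": {"module_name": "langchain.text_splitter",
#                         "class_name": "RecursiveCharacterTextSplitter"},
#     "splitter_args": {"chunk_size": 1000, "chunk_overlap": 100,
#                       "length_function": "len"},  # 'func' key, eval()-ed to len
# }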
@@ -115,7 +138,22 @@ def embed(
splitter_params: dict,
write_mode: DatabaseAction = DatabaseAction.NO_OVERWRITE,
) -> int:
"""Reads documents from a provided directory, performs embedding and captures the embeddings in a vector store."""
"""Reads documents from a provided directory, performs embedding and captures the embeddings in a vector store.
Args:
src_doc_folder: Folder containing the source documents.
vectordb_location (str): Folder of vector store database.
embedding_import: Definition for embedding model.
embedding_kwargs: **kwargs to be provided to embedding class.
vectordb_import: Definition of vector store.
rate_limit: Rate limiting info. Used as a basic limiter dealing with 3rd party API limits.
splitter_params: Specifications for text splitting logic.
write_mode: Whether to OVERWRITE, APPEND or NO_OVERWRITE the vector store. NO_OVERWRITE will
not embed anything if a vector store exists at the vectordb_location.
Returns:
The number of text chunks embedded.
"""
logging.info(f"Starting to embed into VectorDB: {vectordb_location}")

# if folder does not exist, or write_mode is APPEND no need to do anything here.
@@ -131,7 +169,7 @@ def embed(
f"{vectordb_location!r}. Remove database folder, or change embedding config "
"vectorstore_write_mode to OVERWRITE or APPEND."
)
return
return 0
if (
write_mode == DatabaseAction.OVERWRITE
): # remove existing database before embedding
@@ -153,7 +191,7 @@ def embed(
)

# Use chunker to embed in chunks with a wait time in between. As a basic way to deal with some rate limiting.
def chunker(seq: list, size: int) -> list:
def chunker(seq: list, size: int) -> Iterator[list]:
return (seq[pos : pos + size] for pos in range(0, len(seq), size))

c = 0
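# For illustration only (not part of this commit): chunker yields fixed-size
# slices with a shorter final slice, e.g.
# list(chunker([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]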
@@ -181,7 +219,18 @@ def embed_these_chunks(
embedding_kwargs: dict,
vectordb_import: ClassImportDefinition,
) -> int:
"""Embed the provided chunks and capture into a vector store."""
"""Embed the provided chunks and capture into a vector store.
Args:
chunks: List of text chunks to be embedded.
vectordb_location: Location of the folder containing the embedding database.
embedding_import: Definition of embedding model ('to build Python import statement').
embedding_kwargs: Dictionary provided as **kwargs for embedding class.
vectordb_import: Definition of vector store ('to build Python import statement').
Returns:
Number of chunks embedded and captured in vector store.
"""
module = importlib.import_module(embedding_import.module_name)
class_ = getattr(module, embedding_import.class_name)
embedding = class_(**embedding_kwargs)