Type annotation updates (#2)
* Update dev.dependencies section for Poetry.

* Spelling in comment.

* Remove empty line.

* Documentation update.

* Documentation updates.

* Documentation update. Simplify chat_output signature. Add (doc)test.

* Documentation update.

* Add mypy to Nox.

* Return 0 instead of None.

* Add Ruff rules.

* Fix from logging.warn to logging.warning. Also suspend pre-commit Ruff.

* Change variable name to lower case.

* Remove COM Ruff rule.

* Optimize return value.

* Ignore Ruff logging format errors.

* Ignore lowercase for arguments, function in test cases.

* Ruff fixes.

* Ruff fixes.

* Use list comprehension for dict_subs.

* Re-enable Ruff on pre-commit. Ignore BLE Ruff errors.

* Add comment.

* Typing fix. Mypy setting in toml.

* Typing clarification.

* Correct type annotations per mypy feedback.

* Type annotation corrections as per mypy (non-exhaustive).

* Ignore type checking for mdutils (as no annotations exist).

* Add mypy.

* Version update.
EJOOSTEROP authored Aug 31, 2023
1 parent a66214f commit e4b6985
Showing 8 changed files with 217 additions and 91 deletions.
23 changes: 19 additions & 4 deletions .devcontainer/devcontainer.json
@@ -10,10 +10,25 @@

// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
"ghcr.io/devcontainers/features/common-utils:2": {},
"ghcr.io/devcontainers-contrib/features/poetry:2": {},
"ghcr.io/devcontainers-contrib/features/nox:2": {},
"ghcr.io/devcontainers-contrib/features/pre-commit:2": {}
"ghcr.io/devcontainers/features/common-utils:2": {
"installZsh": true,
"installOhMyZsh": true,
"installOhMyZshConfig": true,
"upgradePackages": true,
"username": "devcontainer"
},
"ghcr.io/devcontainers-contrib/features/poetry:2": {
"version": "latest"
},
"ghcr.io/devcontainers-contrib/features/nox:2": {
"version": "latest"
},
"ghcr.io/devcontainers-contrib/features/pre-commit:2": {
"version": "latest"
},
"ghcr.io/devcontainers-contrib/features/mypy:2": {
"version": "latest"
}
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
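Note, as a usage sketch: changes to the feature list in devcontainer.json take effect only after the dev container is rebuilt, for example via the VS Code command palette:

    Dev Containers: Rebuild Container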
18 changes: 13 additions & 5 deletions noxfile.py
@@ -40,10 +40,18 @@ def ruff(session):
@nox.session
def test(session):
# Not certain this is a good approach. But it currently works.
# session.install("pytest")
# session.install("pytest-cov")

session.run("pytest", "--cov=quke", "tests/")

# test_files = session.posargs if session.posargs else []
# session.run("pytest", "--cov=quke", *test_files)
# TODO: test_files = session.posargs if session.posargs else []
# TODO: session.run("pytest", "--cov=quke", *test_files)


@nox.session
def mypy(session):
session.install("mypy")
session.run(
"mypy",
"./quke",
"--python-executable",
"/home/vscode/.cache/pypoetry/virtualenvs/quke-61FoJWY3-py3.11/bin/python",
)
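As a usage sketch, assuming Nox is installed and the hard-coded interpreter path above matches the local Poetry virtualenv:

    nox -s mypy    # run only the new mypy session
    nox            # run every session defined in noxfile.py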
2 changes: 1 addition & 1 deletion poetry.lock


45 changes: 34 additions & 11 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "quke"
version = "0.2.0"
version = "0.2.1"
description = "Compare the answering capabilities of different LLMs - for example LlaMa, ChatGPT, Cohere, Falcon - against user provided document(s) and questions."
authors = ["Erik Oosterop"]
maintainers = ["Erik Oosterop"]
@@ -36,14 +36,10 @@ replicate = "^0.9.0"
mdutils = "^1.6.0"
rich = "^13.5.2"


[tool.poetry.dev-dependencies]
pytest = "^7.4.0"
requests-mock = "^1.11.0"


[tool.poetry.group.dev.dependencies]
pytest-cov = "^4.1.0"
pytest = "^7.4.0"
requests-mock = "^1.11.0"

[build-system]
requires = ["poetry-core"]
@@ -61,22 +57,45 @@ line-length = 119
select = [ # https://beta.ruff.rs/docs/rules/
"A", # prevent using keywords that clobber python builtins
"ANN", # type annotation
"ARG", # unused arguments
"B", # bugbear: security warnings
# "BLE", # blind exceptions
"C",
"C90",
# "COM", # commas
"C4", # comprehension
"C90", # McCabe complexity
"D", # pydocstyle
# "DAR", # darglint, but does not seem to be implemented at the moment
"DTZ", # date timezone
"E", # pycodestyle
"EM", # error messages
"ERA", # eradicate
"EXE", # executables
"F", # pyflakes
"FLY", # f-strings
# "G", # logging format (no f-string)
"I", # isort
"ICN", # import conventions
"INT", # gettext
"ISC", # implicit string concatenation
"N", # pep8 naming
"PERF", # performance lint
"PIE", #
"PT", # pytest style
"PTH", # use pathlib
"Q",
"Q", # quotes
"RET", # return values
"RSE", # error parenthesis
"RUF", # ruff rules
"S", # Bandit
"SIM", # simplify
"TCH", # type checking
# "TD", # TODO
"TID", # tidy imports
"TRY", # tryceratops
"T20", # print statement
"UP", # alert you when better syntax is available in your python version
"W", # pycodestyle warnings
"RUF", # the ruff developer's own rules
]
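As a usage sketch, assuming a recent Ruff CLI, the selection above applies when linting and the fixable set (further below) when auto-fixing:

    ruff check .          # lint using the 'select' rule sets
    ruff check --fix .    # additionally apply the rules listed under 'fixable'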

@@ -98,7 +117,7 @@ fixable = [

[tool.ruff.per-file-ignores]
"tests/**/*.py" = [
# at least this three should be fine in tests:
# at least these three should be fine in tests:
"S101", # asserts allowed in tests...
"ANN", # TODO: do not care about type annotations in tests for now
"ARG", # Unused function args -> fixtures nevertheless are functionally relevant...
@@ -107,6 +126,7 @@ fixable = [
"PLR2004", # Magic value used in comparison, ...
"S311", # Standard pseudo-random generators are not suitable for cryptographic purposes
"D", # no pydocstyle
"N", # Argument, function to lowercase
]

"noxfile.py" = [
@@ -115,4 +135,7 @@ fixable = [
]

[tool.ruff.pydocstyle]
convention = "google"
convention = "google"

[tool.mypy]
disallow_incomplete_defs = true
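A small sketch of what this setting catches, assuming otherwise-default mypy settings: disallow_incomplete_defs rejects functions that are only partially annotated, while fully annotated functions (and, under this particular check, fully unannotated ones) pass.

    def scale(value: float, factor) -> float:  # mypy error: incomplete annotations
        return value * factor

    def scale_ok(value: float, factor: float) -> float:  # fully annotated: passes
        return value * factor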
73 changes: 61 additions & 12 deletions quke/embed.py
@@ -7,6 +7,7 @@
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterator

# [ ] TODO: PyMU is faster, PyPDF more accurate: https://github.com/py-pdf/benchmarks
from langchain.document_loaders import CSVLoader, PyMuPDFLoader, TextLoader
@@ -20,7 +21,10 @@ class DocumentLoaderDef:

ext: str = "pdf"
loader: object = PyMuPDFLoader
kwargs: defaultdict[dict] = field(default_factory=dict) # empty dict
# TODO: Remove this - kwargs: defaultdict[dict] = field(default_factory=dict) # empty dict
kwargs: dict[str, str] = field(
default_factory=lambda: defaultdict(dict)
) # empty dict


DOC_LOADERS = [
@@ -41,6 +45,9 @@ def get_loaders(src_doc_folder: str, loader: DocumentLoaderDef) -> list:
src_doc_folder: The folder of the source files.
loader: Definition of the loader. Loaders exist for example for
pdf, text and csv files.
Returns:
A list of loaders to be used to read the text from source documents.
"""
ext = loader.ext

@@ -51,15 +58,20 @@ def get_loaders(src_doc_folder: str, loader: DocumentLoaderDef) -> list:

# TODO: Problem with embedding more than 2 files at once, or some number of pages/chunks (using HF)?
# Error message does not really help. Appending in steps does work.
loaders = [
return [
loader.loader(str(pdf_name), **loader.kwargs) for pdf_name in src_file_names
]

return loaders


def get_pages_from_document(src_doc_folder: str) -> list:
"""Reads documents from the directory/folder provided and returns a list of pages and metadata."""
"""Reads documents from the directory/folder provided and returns a list of pages and metadata.
Args:
src_doc_folder: Folder containing the source documents.
Returns:
List containing one page per list item, as text.
"""
pages = []
for docloader in DOC_LOADERS:
for loader in get_loaders(src_doc_folder, docloader):
@@ -79,9 +91,20 @@ def get_pages_from_document(src_doc_folder: str) -> list:


def get_chunks_from_pages(pages: list, splitter_params: dict) -> list:
"""Splits pages into smaller chunks used for embedding."""
# for splitter args containing 'func', the yaml value is converted into a Python function.
# TODO: Security risk? Hence a safe_list of functions is provided; severly limiting flexibility.
"""Splits pages into smaller chunks used for embedding.
Args:
pages: List with page text of a document(s).
splitter_params: Dictionary with settings for splitting logic, having
keys splitter_args and splitter_import.
splitter_args are provided to the splitter function as **kwargs. Note that if a keyword
contains 'func' the value will be evaluated as a python function (only 'len' allowed).
Returns:
A list of smaller text chunks from the pages. In a next step to be used for embedding.
"""
# TODO: eval() is a security risk. Hence a safe_list of functions is provided; severely
# limiting risk and flexibility.
# TODO: The other limiting factor: any parameter containing 'func' is eval()-ed into a function reference;
# also no other parameter is.
safe_function_list = ["len"]
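# For illustration only (not part of this commit): the splitter_params shape
# described in the docstring above might look as follows; the exact key names
# come from the project's YAML config, so treat them as assumptions.
# splitter_params = {
#     "splitter_import": {"module_name": "langchain.text_splitter",
#                         "class_name": "RecursiveCharacterTextSplitter"},
#     "splitter_args": {"chunk_size": 1000, "chunk_overlap": 100,
#                       "length_function": "len"},  # 'func' key, eval()-ed to len
# }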
@@ -115,7 +138,22 @@ def embed(
splitter_params: dict,
write_mode: DatabaseAction = DatabaseAction.NO_OVERWRITE,
) -> int:
"""Reads documents from a provided directory, performs embedding and captures the embeddings in a vector store."""
"""Reads documents from a provided directory, performs embedding and captures the embeddings in a vector store.
Args:
src_doc_folder: Folder containing the source documents.
vectordb_location (str): Folder of vector store database.
embedding_import: Definition for embedding model.
embedding_kwargs: **kwargs to be provided to embedding class.
vectordb_import: Definition of vector store.
rate_limit: Rate limiting info. Used as a basic limiter dealing with 3rd party API limits.
splitter_params: Specifications for text splitting logic.
write_mode: Whether to OVERWRITE, APPEND or NO_OVERWRITE the vector store. NO_OVERWRITE will
not embed anything if a vector store exists at the vectordb_location.
Returns:
The number of text chunks embedded.
"""
logging.info(f"Starting to embed into VectorDB: {vectordb_location}")

# if folder does not exist, or write_mode is APPEND no need to do anything here.
@@ -131,7 +169,7 @@ def embed(
f"{vectordb_location!r}. Remove database folder, or change embedding config "
"vectorstore_write_mode to OVERWRITE or APPEND."
)
return
return 0
if (
write_mode == DatabaseAction.OVERWRITE
): # remove existing database before embedding
@@ -153,7 +191,7 @@ def embed(
)

# Use chunker to embed in chunks with a wait time in between. As a basic way to deal with some rate limiting.
def chunker(seq: list, size: int) -> list:
def chunker(seq: list, size: int) -> Iterator[list]:
return (seq[pos : pos + size] for pos in range(0, len(seq), size))

c = 0
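# For illustration only (not part of this commit): chunker yields fixed-size
# slices with a shorter final slice, e.g.
# list(chunker([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]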
@@ -181,7 +219,18 @@ def embed_these_chunks(
embedding_kwargs: dict,
vectordb_import: ClassImportDefinition,
) -> int:
"""Embed the provided chunks and capture into a vector store."""
"""Embed the provided chunks and capture into a vector store.
Args:
chunks: List of text chunks to be embedded.
vectordb_location: Location of the folder containing the embedding database.
embedding_import: Definition of embedding model ('to build Python import statement').
embedding_kwargs: Dictionary provided as **kwargs for embedding class.
vectordb_import: Definition of vector store ('to build Python import statement').
Returns:
Number of chunks embedded and captured in vector store.
"""
module = importlib.import_module(embedding_import.module_name)
class_ = getattr(module, embedding_import.class_name)
embedding = class_(**embedding_kwargs)