Use Jinja for output files (#3)
* Add jinja to Poetry environment.

* Working base version of HTML Jinja template.

* Template formatting. Save to file.

* Use Jinja for logging output. Prepare for other (Markdown) output file.

* Remove use of mdutils package (replaced by Jinja).

* .html output file is OK.

* Add markdown template (output file).

* Version bump

* Remove mdutils from pyproject.toml and poetry.lock.

* version (patch) bump
EJOOSTEROP authored Aug 31, 2023
1 parent e4b6985 commit 375de86
Showing 9 changed files with 197 additions and 66 deletions.
88 changes: 77 additions & 11 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "quke"
-version = "0.2.1"
+version = "0.3.1"
description = "Compare the answering capabilities of different LLMs - for example LlaMa, ChatGPT, Cohere, Falcon - against user provided document(s) and questions."
authors = ["Erik Oosterop"]
maintainers = ["Erik Oosterop"]
@@ -33,8 +33,8 @@ huggingface-hub = "^0.16.4"
openai = "^0.27.8"
cohere = "^4.17.0"
replicate = "^0.9.0"
-mdutils = "^1.6.0"
rich = "^13.5.2"
+jinja2 = "^3.1.2"

[tool.poetry.group.dev.dependencies]
pytest-cov = "^4.1.0"
4 changes: 2 additions & 2 deletions quke/conf/embedding/huggingface.yaml
@@ -4,11 +4,11 @@ vectordb:
  vectorstore_location: vector_store/chromadb_hf_del

# Possible values for vectorstore_write_mode: overwrite, no_overwrite, append
-# This works at the vectorstore_location level.
+# This works at the vectorstore_location level.
# -If the folder exists and 'no_overwrite' is specified: document will not be embedded
# -If the folder exists and 'overwrite' is specified, all contents of the vectordb folder will be deleted and a new vectordb will be created.
# -If set to 'append' the new embeddings will be appended to any existing vectordb. If a source document is specified twice it will be embedded twice.
-  vectorstore_write_mode: overwrite
+  vectorstore_write_mode: no_overwrite

embedding:
  module_name: langchain.embeddings
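The write-mode rules documented in the comments above reduce to a small branch on whether the vector-store folder already exists. A minimal sketch of that logic, with a hypothetical helper name and path handling (not quke's actual implementation):

```python
import shutil
from pathlib import Path


def prepare_vectorstore_location(location: str, write_mode: str) -> bool:
    """Return True if embedding should proceed, applying the write-mode rules above."""
    folder = Path(location)
    if folder.exists():
        if write_mode == "no_overwrite":
            return False  # folder already exists: skip embedding entirely
        if write_mode == "overwrite":
            shutil.rmtree(folder)  # drop the old vectordb; a fresh one is created below
        # "append": keep the folder; new embeddings are added (duplicates embed twice)
    folder.mkdir(parents=True, exist_ok=True)
    return True
```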
99 changes: 50 additions & 49 deletions quke/llm_chat.py
@@ -4,11 +4,11 @@
from collections import defaultdict
from datetime import datetime
from pathlib import Path
+from typing import Literal

+from jinja2 import Environment, PackageLoader, select_autoescape
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
-from mdutils.fileutils import MarkDownFile  # type: ignore
-from mdutils.mdutils import MdUtils  # type: ignore

from . import ClassImportDefinition

@@ -71,71 +71,72 @@ def chat(

    # NOTE: trial API keys may have very restrictive rules. It is plausible that you run into
    # constraints after the 2nd question.
-    for question in prompt_parameters:
-        result = qa({"question": question})
-
-        chat_output(result)
-        chat_output_to_file(result, output_file)
+    results = [qa({"question": question}) for question in prompt_parameters]
+    chat_output_to_html(
+        results, output_file
+    )  # TODO: infer output from output file name in cfg?
+    chat_output_to_html(results, output_file, output_extension=".md")
+    chat_output_to_html(results, output_file, output_extension="logging")

    logging.info("=======================")

    return qa


-def chat_output(result: dict) -> None:
-    """Logs a chat question and anwer.
-
-    Args:
-        result: dict with the answer from the LLM. Expects 'question', 'answer' and 'source' keys,
-        'page' key optionally.
-    """
-    logging.info("=======================")
-    logging.info(f"Q: {result['question']}")
-    logging.info(f"A: {result['answer']}")
-
-    src_docs = [doc.metadata for doc in result["source_documents"]]
-    src_docs_pages_used = dict_crosstab(src_docs, "source", "page")
-    for key, value in src_docs_pages_used.items():
-        logging.info(f"Source document: {key}, Pages used: {value}")
-
-
-# TODO: Either I do not understand mdutils or it is an unfriendly package when trying to append.
-def chat_output_to_file(result: dict, output_file: dict) -> None:
-    """Populates a record of the chat with the LLM into a markdown file.
-
-    Args:
-        result: dict with the answer from the LLM. Expects 'question', 'answer' and 'source' keys,
-        'page' key optionally.
-        output_file: File name to which the record is saved.
-    """
-    first_write = not Path(output_file["path"]).is_file()
-
-    md_file = MdUtils(file_name="tmp.md")
-
-    if first_write:
-        md_file.new_header(1, "LLM Chat Session with quke")
-        md_file.write(
-            datetime.now().astimezone().strftime("%a %d-%b-%Y %H:%M %Z"), align="center"
-        )
-        md_file.new_paragraph("")
-        md_file.new_header(2, "Experiment settings", header_id="settings")
-        md_file.insert_code(output_file["conf_yaml"], language="yaml")
-        md_file.new_header(2, "Chat", header_id="chat")
-    else:
-        existing_text = MarkDownFile().read_file(file_name=output_file["path"])
-        md_file.new_paragraph(existing_text)
-
-    md_file.new_paragraph(f"Q: {result['question']}")
-    md_file.new_paragraph(f"A: {result['answer']}")
-
-    src_docs = [doc.metadata for doc in result["source_documents"]]
-    src_docs_pages_used = dict_crosstab(src_docs, "source", "page")
-    for key, value in src_docs_pages_used.items():
-        md_file.new_paragraph(f"Source document: {key}, Pages used: {value}")
-
-    new = MarkDownFile(name=output_file["path"])
-
-    new.append_end((md_file.get_md_text()).strip())
+def chat_output_to_html(
+    results: list[dict],
+    output_file: dict,
+    output_extension: Literal[".html", ".md", "logging"] = ".html",
+) -> None:
+    """Write summary of chat experiment into HTML file.
+
+    Args:
+        results: list of dicts with the answer from the LLM. Expects 'question', 'answer'
+        and 'source' keys; 'page' key optionally.
+        output_file: path and other information regarding the output file.
+        output_extension: .html or .md. Alternatively logging for python logging.
+    """
+    env = Environment(loader=PackageLoader("quke"), autoescape=select_autoescape())
+
+    if output_extension.lower() == ".html":
+        template_name = "chat_session.html.jinja"
+    elif output_extension.lower() == ".md":
+        template_name = "chat_session.md.jinja"
+    elif output_extension.lower() == "logging":
+        template_name = "chat_session.logging.jinja"
+    else:
+        template_name = "chat_session.html.jinja"
+
+    template = env.get_template(template_name)
+    func_dict = {"dict_crosstab": _dict_crosstab_for_jinja}
+    template.globals.update(func_dict)
+
+    output = template.render(
+        chat_time=datetime.now().astimezone().strftime("%a %d-%b-%Y %H:%M %Z"),
+        llm_results=results,
+        config=output_file["conf_yaml"],
+    )
+
+    if output_extension.lower() == "logging":
+        logging.info(output)
+    else:
+        file_path = Path(output_file["path"]).with_suffix(output_extension)
+        with file_path.open("w") as fp:
+            fp.write(output)
+
+
+def _dict_crosstab_for_jinja(sources: list) -> dict:
+    """Wrapper around dict_crosstab for use from within Jinja.
+
+    Args:
+        sources (list): _description_
+
+    Returns:
+        dict: _description_
+    """
+    src_docs = [doc.metadata for doc in sources]
+    return dict_crosstab(src_docs, "source", "page")


def dict_crosstab(source: list, key: str, listed: str, missing: str = "NA") -> dict:
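The pattern this commit introduces in llm_chat.py — one list of chat results rendered through per-format Jinja templates, with a Python helper exposed to the templates through template.globals — can be tried in isolation. A minimal sketch using an in-memory DictLoader; the template string, helper lambda, and sample results are illustrative, not quke's actual templates:

```python
from jinja2 import DictLoader, Environment, select_autoescape

# Illustrative stand-in for quke's chat_session.*.jinja templates.
templates = {
    "session.md.jinja": (
        "# Chat\n"
        "{% for r in llm_results %}Q: {{ r.question }}\nA: {{ r.answer }}\n"
        "Sources: {{ crosstab(r.source_documents) }}\n{% endfor %}"
    ),
}

env = Environment(loader=DictLoader(templates), autoescape=select_autoescape())
template = env.get_template("session.md.jinja")

# Expose a Python helper to the template, mirroring template.globals.update(func_dict).
template.globals.update({"crosstab": lambda docs: {d["source"]: d["page"] for d in docs}})

results = [
    {
        "question": "What is quke?",
        "answer": "A tool to compare LLMs against your documents.",
        "source_documents": [{"source": "readme.md", "page": 1}],
    }
]

print(template.render(llm_results=results))
```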
2 changes: 1 addition & 1 deletion quke/quke.py
@@ -166,7 +166,7 @@ def quke(cfg: DictConfig) -> None:
    with console.status("Embedding...", spinner="aesthetic"):
        # python -m rich.spinner to see options
        embed.embed(**embed_parameters)
-    logging.info("\n" + OmegaConf.to_yaml(cfg))
+    # Used to log config here: logging.info("\n" + OmegaConf.to_yaml(cfg))

    if not config_parser.embed_only:
        with console.status("Chatting...", spinner="aesthetic"):
31 changes: 31 additions & 0 deletions quke/templates/chat_session.html.jinja
@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html lang="en">

<head>
<title>LLM Chat Session with quke</title>

</head>
<body>
<h1>LLM Chat Session with quke</h1>
<div>{{ chat_time }}</div>
<h1>Experiment Settings</h1>
<pre>
<code>{{ config }}</code>
</pre>
<h1>Chat</h1>
{% for result in llm_results %}
<div>Q: <strong>{{ result.question }}</strong></div>
<div>A: {{ result.answer }}</div>
<br>
<div>Source: </div>
<div>
{% for key, value in dict_crosstab(result.source_documents).items() %}
{{ key }}, pages: {{ value }}
{% endfor %}
</div>
<div><br></div>
{% endfor %}
</body>

{# 1) timestamp 2) conf summary 3) chat: [question, answer, source (optional)] #}
</html>
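To iterate on this template without running a full chat, it can be rendered directly against stub data. A rough preview harness, assuming this version of quke is installed (so PackageLoader("quke") finds the templates) and faking LangChain documents with SimpleNamespace; the crosstab helper below only approximates quke's dict_crosstab:

```python
from collections import defaultdict
from datetime import datetime
from types import SimpleNamespace

from jinja2 import Environment, PackageLoader, select_autoescape


def dict_crosstab_stub(sources: list) -> dict:
    """Simplified stand-in for quke's dict_crosstab: source -> pages used."""
    pages = defaultdict(list)
    for doc in sources:
        pages[doc.metadata["source"]].append(doc.metadata.get("page", "NA"))
    return dict(pages)


env = Environment(loader=PackageLoader("quke"), autoescape=select_autoescape())
template = env.get_template("chat_session.html.jinja")
template.globals.update({"dict_crosstab": dict_crosstab_stub})

# Stand-ins for LangChain documents: the template only needs .metadata.
doc = SimpleNamespace(metadata={"source": "example.pdf", "page": 3})
results = [{"question": "Q?", "answer": "A.", "source_documents": [doc, doc]}]

print(
    template.render(
        chat_time=datetime.now().astimezone().strftime("%a %d-%b-%Y %H:%M %Z"),
        llm_results=results,
        config="llm: example",
    )
)
```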
8 changes: 8 additions & 0 deletions quke/templates/chat_session.logging.jinja
@@ -0,0 +1,8 @@
=======================
{{ config }}
{% for result in llm_results %}
Q: {{ result.question }}
A: {{ result.answer }}
Source: {% for key, value in dict_crosstab(result.source_documents).items() %}
document: {{ key }}, page: {{ value }} {% endfor %}
{% endfor %}=======================
22 changes: 22 additions & 0 deletions quke/templates/chat_session.md.jinja
@@ -0,0 +1,22 @@
# LLM Chat Session with quke
<center>{{ chat_time }}</center>

## Experiment settings

```yaml
{{ config }}
```

## Chat

{% for result in llm_results %}
Q: {{ result.question }}

A: {{ result.answer }}

Source: {% for key, value in dict_crosstab(result.source_documents).items() %}
{{ key }}, pages: {{ value }}
{% endfor %}
-------

{% endfor %}
5 changes: 4 additions & 1 deletion tests/test_001.py
@@ -108,7 +108,10 @@ def test_chat(GetConfigLLMOnly: DictConfig):

    chat_result = chat(**ConfigParser(GetConfigLLMOnly).get_chat_params())
    assert isinstance(chat_result, ConversationalRetrievalChain)
-    assert Path(ConfigParser(GetConfigLLMOnly).output_file).is_file()
+    assert (
+        Path(ConfigParser(GetConfigLLMOnly).output_file).with_suffix(".html").is_file()
+        or Path(ConfigParser(GetConfigLLMOnly).output_file).with_suffix(".md").is_file()
+    )


def test_crosstab_dict(GetCrossTabDicts: list):
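The updated assertion leans on pathlib.Path.with_suffix, the same call chat_output_to_html uses to derive the .html and .md file names from one configured output path. A quick illustration of that behavior:

```python
from pathlib import Path

base = Path("output/chat_session.md")

# with_suffix replaces the final suffix, so a single configured path
# can yield both the .html and the .md output files.
print(base.with_suffix(".html"))  # output/chat_session.html
print(base.with_suffix(".md"))    # output/chat_session.md
```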
