Use Jinja for output files (#3)
* Add jinja to Poetry environment.

* Working base version of HTML Jinja template.

* Template formatting. Save to file.

* Use Jinja for logging output. Prepare for other (Markdown) output file.

* Remove use of mdutils package (replaced by Jinja).

* .html output file is OK.

* Add markdown template (output file).

* Version bump

* Remove mdutils from pyproject.toml and poetry.lock.

* version (patch) bump
EJOOSTEROP authored Aug 31, 2023
1 parent e4b6985 commit 375de86
Showing 9 changed files with 197 additions and 66 deletions.
88 changes: 77 additions & 11 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "quke"
-version = "0.2.1"
+version = "0.3.1"
description = "Compare the answering capabilities of different LLMs - for example LlaMa, ChatGPT, Cohere, Falcon - against user provided document(s) and questions."
authors = ["Erik Oosterop"]
maintainers = ["Erik Oosterop"]
@@ -33,8 +33,8 @@ huggingface-hub = "^0.16.4"
openai = "^0.27.8"
cohere = "^4.17.0"
replicate = "^0.9.0"
-mdutils = "^1.6.0"
rich = "^13.5.2"
+jinja2 = "^3.1.2"

[tool.poetry.group.dev.dependencies]
pytest-cov = "^4.1.0"
4 changes: 2 additions & 2 deletions quke/conf/embedding/huggingface.yaml
@@ -4,11 +4,11 @@ vectordb:
  vectorstore_location: vector_store/chromadb_hf_del

# Possible values for vectorstore_write_mode: overwrite, no_overwrite, append
-# This works at the vectorstore_location level.
+# This works at the vectorstore_location level.
# -If the folder exists and 'no_overwrite' is specified: document will not be embedded
# -If the folder exists and 'overwrite' is specified, all contents of the vectordb folder will be deleted and a new vectordb will be created.
# -If set to 'append' the new embeddings will be appended to any existing vectordb. If a source document is specified twice it will be embedded twice.
-  vectorstore_write_mode: overwrite
+  vectorstore_write_mode: no_overwrite

embedding:
  module_name: langchain.embeddings
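The write-mode rules documented in the comments above reduce to a small branch on whether the vector-store folder already exists. A minimal sketch of that logic, with a hypothetical helper name and path handling (not quke's actual implementation):

```python
import shutil
from pathlib import Path


def prepare_vectorstore_location(location: str, write_mode: str) -> bool:
    """Return True if embedding should proceed, applying the write-mode rules above."""
    folder = Path(location)
    if folder.exists():
        if write_mode == "no_overwrite":
            return False  # folder already exists: skip embedding entirely
        if write_mode == "overwrite":
            shutil.rmtree(folder)  # drop the old vectordb; a fresh one is created below
        # "append": keep the folder; new embeddings are added (duplicates embed twice)
    folder.mkdir(parents=True, exist_ok=True)
    return True
```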
99 changes: 50 additions & 49 deletions quke/llm_chat.py
@@ -4,11 +4,11 @@
from collections import defaultdict
from datetime import datetime
from pathlib import Path
+from typing import Literal

+from jinja2 import Environment, PackageLoader, select_autoescape
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
-from mdutils.fileutils import MarkDownFile  # type: ignore
-from mdutils.mdutils import MdUtils  # type: ignore

from . import ClassImportDefinition

@@ -71,71 +71,72 @@ def chat(

    # NOTE: trial API keys may have very restrictive rules. It is plausible that you run into
    # constraints after the 2nd question.
-    for question in prompt_parameters:
-        result = qa({"question": question})
-
-        chat_output(result)
-        chat_output_to_file(result, output_file)
+    results = [qa({"question": question}) for question in prompt_parameters]
+    chat_output_to_html(
+        results, output_file
+    )  # TODO: infer output from output file name in cfg?
+    chat_output_to_html(results, output_file, output_extension=".md")
+    chat_output_to_html(results, output_file, output_extension="logging")

    logging.info("=======================")

    return qa


-def chat_output(result: dict) -> None:
-    """Logs a chat question and anwer.
-
-    Args:
-        result: dict with the answer from the LLM. Expects 'question', 'answer' and 'source' keys,
-        'page' key optionally.
-    """
-    logging.info("=======================")
-    logging.info(f"Q: {result['question']}")
-    logging.info(f"A: {result['answer']}")
-
-    src_docs = [doc.metadata for doc in result["source_documents"]]
-    src_docs_pages_used = dict_crosstab(src_docs, "source", "page")
-    for key, value in src_docs_pages_used.items():
-        logging.info(f"Source document: {key}, Pages used: {value}")
-
-
-# TODO: Either I do not understand mdutils or it is an unfriendly package when trying to append.
-def chat_output_to_file(result: dict, output_file: dict) -> None:
-    """Populates a record of the chat with the LLM into a markdown file.
-
-    Args:
-        result: dict with the answer from the LLM. Expects 'question', 'answer' and 'source' keys,
-        'page' key optionally.
-        output_file: File name to which the record is saved.
-    """
-    first_write = not Path(output_file["path"]).is_file()
-
-    md_file = MdUtils(file_name="tmp.md")
-
-    if first_write:
-        md_file.new_header(1, "LLM Chat Session with quke")
-        md_file.write(
-            datetime.now().astimezone().strftime("%a %d-%b-%Y %H:%M %Z"), align="center"
-        )
-        md_file.new_paragraph("")
-        md_file.new_header(2, "Experiment settings", header_id="settings")
-        md_file.insert_code(output_file["conf_yaml"], language="yaml")
-        md_file.new_header(2, "Chat", header_id="chat")
-    else:
-        existing_text = MarkDownFile().read_file(file_name=output_file["path"])
-        md_file.new_paragraph(existing_text)
-
-    md_file.new_paragraph(f"Q: {result['question']}")
-    md_file.new_paragraph(f"A: {result['answer']}")
-
-    src_docs = [doc.metadata for doc in result["source_documents"]]
-    src_docs_pages_used = dict_crosstab(src_docs, "source", "page")
-    for key, value in src_docs_pages_used.items():
-        md_file.new_paragraph(f"Source document: {key}, Pages used: {value}")
-
-    new = MarkDownFile(name=output_file["path"])
-
-    new.append_end((md_file.get_md_text()).strip())
+def chat_output_to_html(
+    results: list[dict],
+    output_file: dict,
+    output_extension: Literal[".html", ".md", "logging"] = ".html",
+) -> None:
+    """Write summary of chat experiment into HTML file.
+
+    Args:
+        results: list of dicts with the answer from the LLM. Expects 'question', 'answer'
+        and 'source' keys; 'page' key optionally.
+        output_file: path and other information regarding the output file.
+        output_extension: .html or .md. Alternatively logging for python logging.
+    """
+    env = Environment(loader=PackageLoader("quke"), autoescape=select_autoescape())
+
+    if output_extension.lower() == ".html":
+        template_name = "chat_session.html.jinja"
+    elif output_extension.lower() == ".md":
+        template_name = "chat_session.md.jinja"
+    elif output_extension.lower() == "logging":
+        template_name = "chat_session.logging.jinja"
+    else:
+        template_name = "chat_session.html.jinja"
+
+    template = env.get_template(template_name)
+    func_dict = {"dict_crosstab": _dict_crosstab_for_jinja}
+    template.globals.update(func_dict)
+
+    output = template.render(
+        chat_time=datetime.now().astimezone().strftime("%a %d-%b-%Y %H:%M %Z"),
+        llm_results=results,
+        config=output_file["conf_yaml"],
+    )
+
+    if output_extension.lower() == "logging":
+        logging.info(output)
+    else:
+        file_path = Path(output_file["path"]).with_suffix(output_extension)
+        with file_path.open("w") as fp:
+            fp.write(output)
+
+
+def _dict_crosstab_for_jinja(sources: list) -> dict:
+    """Wrapper around dict_crosstab for use from within Jinja.
+
+    Args:
+        sources (list): _description_
+
+    Returns:
+        dict: _description_
+    """
+    src_docs = [doc.metadata for doc in sources]
+    return dict_crosstab(src_docs, "source", "page")


def dict_crosstab(source: list, key: str, listed: str, missing: str = "NA") -> dict:
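The pattern this commit introduces in llm_chat.py — one list of chat results rendered through per-format Jinja templates, with a Python helper exposed to the templates through template.globals — can be tried in isolation. A minimal sketch using an in-memory DictLoader; the template string, helper lambda, and sample results are illustrative, not quke's actual templates:

```python
from jinja2 import DictLoader, Environment, select_autoescape

# Illustrative stand-in for quke's chat_session.*.jinja templates.
templates = {
    "session.md.jinja": (
        "# Chat\n"
        "{% for r in llm_results %}Q: {{ r.question }}\nA: {{ r.answer }}\n"
        "Sources: {{ crosstab(r.source_documents) }}\n{% endfor %}"
    ),
}

env = Environment(loader=DictLoader(templates), autoescape=select_autoescape())
template = env.get_template("session.md.jinja")

# Expose a Python helper to the template, mirroring template.globals.update(func_dict).
template.globals.update({"crosstab": lambda docs: {d["source"]: d["page"] for d in docs}})

results = [
    {
        "question": "What is quke?",
        "answer": "A tool to compare LLMs against your documents.",
        "source_documents": [{"source": "readme.md", "page": 1}],
    }
]

print(template.render(llm_results=results))
```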
2 changes: 1 addition & 1 deletion quke/quke.py
@@ -166,7 +166,7 @@ def quke(cfg: DictConfig) -> None:
    with console.status("Embedding...", spinner="aesthetic"):
        # python -m rich.spinner to see options
        embed.embed(**embed_parameters)
-    logging.info("\n" + OmegaConf.to_yaml(cfg))
+    # Used to log config here: logging.info("\n" + OmegaConf.to_yaml(cfg))

    if not config_parser.embed_only:
        with console.status("Chatting...", spinner="aesthetic"):
31 changes: 31 additions & 0 deletions quke/templates/chat_session.html.jinja
@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html lang="en">

<head>
<title>LLM Chat Session with quke</title>

</head>
<body>
<h1>LLM Chat Session with quke</h1>
<div>{{ chat_time }}</div>
<h1>Experiment Settings</h1>
<pre>
<code>{{ config }}</code>
</pre>
<h1>Chat</h1>
{% for result in llm_results %}
<div>Q: <strong>{{ result.question }}</strong></div>
<div>A: {{ result.answer }}</div>
<br>
<div>Source: </div>
<div>
{% for key, value in dict_crosstab(result.source_documents).items() %}
{{ key }}, pages: {{ value }}
{% endfor %}
</div>
<div><br></div>
{% endfor %}
</body>

{# 1) timestamp 2) conf summary 3) chat: [question, answer, source (optional)] #}
</html>
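To iterate on this template without running a full chat, it can be rendered directly against stub data. A rough preview harness, assuming this version of quke is installed (so PackageLoader("quke") finds the templates) and faking LangChain documents with SimpleNamespace; the crosstab helper below only approximates quke's dict_crosstab:

```python
from collections import defaultdict
from datetime import datetime
from types import SimpleNamespace

from jinja2 import Environment, PackageLoader, select_autoescape


def dict_crosstab_stub(sources: list) -> dict:
    """Simplified stand-in for quke's dict_crosstab: source -> pages used."""
    pages = defaultdict(list)
    for doc in sources:
        pages[doc.metadata["source"]].append(doc.metadata.get("page", "NA"))
    return dict(pages)


env = Environment(loader=PackageLoader("quke"), autoescape=select_autoescape())
template = env.get_template("chat_session.html.jinja")
template.globals.update({"dict_crosstab": dict_crosstab_stub})

# Stand-ins for LangChain documents: the template only needs .metadata.
doc = SimpleNamespace(metadata={"source": "example.pdf", "page": 3})
results = [{"question": "Q?", "answer": "A.", "source_documents": [doc, doc]}]

print(
    template.render(
        chat_time=datetime.now().astimezone().strftime("%a %d-%b-%Y %H:%M %Z"),
        llm_results=results,
        config="llm: example",
    )
)
```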
8 changes: 8 additions & 0 deletions quke/templates/chat_session.logging.jinja
@@ -0,0 +1,8 @@
=======================
{{ config }}
{% for result in llm_results %}
Q: {{ result.question }}
A: {{ result.answer }}
Source: {% for key, value in dict_crosstab(result.source_documents).items() %}
document: {{ key }}, page: {{ value }} {% endfor %}
{% endfor %}=======================
22 changes: 22 additions & 0 deletions quke/templates/chat_session.md.jinja
@@ -0,0 +1,22 @@
# LLM Chat Session with quke
<center>{{ chat_time }}</center>

## Experiment settings

```yaml
{{ config }}
```

## Chat

{% for result in llm_results %}
Q: {{ result.question }}

A: {{ result.answer }}

Source: {% for key, value in dict_crosstab(result.source_documents).items() %}
{{ key }}, pages: {{ value }}
{% endfor %}
-------

{% endfor %}
5 changes: 4 additions & 1 deletion tests/test_001.py
@@ -108,7 +108,10 @@ def test_chat(GetConfigLLMOnly: DictConfig):

    chat_result = chat(**ConfigParser(GetConfigLLMOnly).get_chat_params())
    assert isinstance(chat_result, ConversationalRetrievalChain)
-    assert Path(ConfigParser(GetConfigLLMOnly).output_file).is_file()
+    assert (
+        Path(ConfigParser(GetConfigLLMOnly).output_file).with_suffix(".html").is_file()
+        or Path(ConfigParser(GetConfigLLMOnly).output_file).with_suffix(".md").is_file()
+    )


def test_crosstab_dict(GetCrossTabDicts: list):
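The updated assertion leans on pathlib.Path.with_suffix, the same call chat_output_to_html uses to derive the .html and .md file names from one configured output path. A quick illustration of that behavior:

```python
from pathlib import Path

base = Path("output/chat_session.md")

# with_suffix replaces the final suffix, so a single configured path
# can yield both the .html and the .md output files.
print(base.with_suffix(".html"))  # output/chat_session.html
print(base.with_suffix(".md"))    # output/chat_session.md
```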
