Merge branch 'feat-rag-add-pdf' into staging

etalab-ia · Oct 17, 2024 · a9244a8 · a9244a8
2 parents 26ed9f6 + 26becab
commit a9244a8
Show file tree

Hide file tree

Showing 11 changed files with 275 additions and 39 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: CC0-1.0
 
 # Project-specific
-store
-session.txt
+data/store
+data/session.txt
 
 *.pyc
 __pycache__

diff --git a/README.md b/README.md
@@ -62,6 +62,32 @@ Pour lancer le bot executez :
 python app
 ```
 
+#### NOTE 1
+
+Cette commande s'arrêtera sûrement si vous ne la lancez pas en mode sudo, car
+elle crée par défaut le data/store et le data/session.txt à la racine "/".
+Vous pouvez lancer l'application pour qu'elle crée ces fichiers directement dans le dossier du projet avec la commande :
+
+```bash
+export STORE_PATH='./data/store/' && export SESSION_PATH='./data/session.txt' && python app
+```
+
+#### NOTE 2
+
+Si vous voulez développer tout en faisant en sorte que le bot se recharge automatiquement, vous pouvez utiliser par exemple [nodemon](https://github.com/python-nodemon/nodemon) en module global Python et lancer la commande suivante dans un terminal :
+
+```bash
+nodemon --watch app --ext py --exec "export STORE_PATH='./data/store/' && export SESSION_PATH='./data/session.txt' && python app"
+```
+
+#### NOTE 3
+
+Si vous voulez que les messages générés par votre bot se distinguent des autres messages, éventuellement envoyés par d'autres bots (comme celui de staging), vous pouvez définir un préfixe avec la variable `MESSAGE_PREFIX` :
+
+```bash
+nodemon --watch app --ext py --exec "export MESSAGE_PREFIX='[DEV]' && export STORE_PATH='./data/store/' && export SESSION_PATH='./data/session.txt' && python app"
+```
+
 
 ### Troubleshooting
 

diff --git a/app/bot_msg.py b/app/bot_msg.py
@@ -13,6 +13,7 @@ class AlbertMsg:
     shorts = {
         "help": f"Pour retrouver ce message informatif, tapez `{COMMAND_PREFIX}aide`. Pour les geek tapez `{COMMAND_PREFIX}aide -v`.",
         "reset": f"Pour ré-initialiser notre conversation, tapez `{COMMAND_PREFIX}reset`",
+        "collections": f"Pour modifier l'ensemble des collections utilisées quand vous me posez une question, tapez `{COMMAND_PREFIX}collections list/use/unuse COLLECTION_NAME/all`",
         "conversation": f"Pour activer/désactiver le mode conversation, tapez `{COMMAND_PREFIX}conversation`",
         "debug": f"Pour afficher des informations sur la configuration actuelle, `{COMMAND_PREFIX}debug`",
         "model": f"Pour modifier le modèle, tapez `{COMMAND_PREFIX}model MODEL_NAME`",

diff --git a/app/commands.py b/app/commands.py
@@ -13,18 +13,27 @@
 from matrix_bot.client import MatrixClient
 from matrix_bot.config import logger
 from matrix_bot.eventparser import EventNotConcerned, EventParser
-from nio import Event, RoomMemberEvent, RoomMessageFile, RoomMessageText
+from nio import Event, RoomEncryptedFile, RoomMemberEvent, RoomMessageText
 
 from bot_msg import AlbertMsg
 from config import COMMAND_PREFIX, Config
 from core_llm import (
+    delete_collections_with_name,
+    get_all_public_collections,
+    get_or_create_collection_with_name,
     generate,
     get_available_models,
     get_available_modes,
+    upload_file,
 )
 from iam import TchapIam
-from tchap_utils import get_cleanup_body, get_previous_messages, get_thread_messages, isa_reply_to
-
+from tchap_utils import (
+    get_cleanup_body, 
+    get_decrypted_file,
+    get_previous_messages, 
+    get_thread_messages, 
+    isa_reply_to
+)
 
 @dataclass
 class CommandRegistry:
@@ -328,7 +337,6 @@ async def albert_model(ep: EventParser, matrix_client: MatrixClient):
 @only_allowed_user
 async def albert_mode(ep: EventParser, matrix_client: MatrixClient):
     config = user_configs[ep.sender]
-    await matrix_client.room_typing(ep.room.room_id)
     command = ep.get_command()
     # Get all available mode for the current model
     all_modes = get_available_modes(config)
@@ -347,8 +355,18 @@ async def albert_mode(ep: EventParser, matrix_client: MatrixClient):
             old_mode = config.albert_mode
             config.albert_mode = mode
             message = f"Le mode a été modifié : {old_mode} -> {mode}"
+
     await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")
 
+    if mode == "norag":
+        message = "Nettoyage des collections RAG propres à cette conversation..."
+        await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")  
+        await matrix_client.room_typing(ep.room.room_id)
+        delete_collections_with_name(config, ep.room.room_id)
+        config.albert_collections_by_id = {}
+        message = "Nettoyage des collections RAG terminé."
+        await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")  
+
 
 @register_feature(
     group="albert",
@@ -361,15 +379,13 @@ async def albert_sources(ep: EventParser, matrix_client: MatrixClient):
     config = user_configs[ep.sender]
 
     try:
-        if config.last_rag_sources:
+        if config.last_rag_chunks:
             await matrix_client.room_typing(ep.room.room_id)
-            sources = config.last_rag_sources
             sources_msg = ""
-            for source in sources:
-                extra_context = ""
-                if source.get("context"):
-                    extra_context = f'({source["context"]})'
-                sources_msg += f'- {source["title"]} {extra_context}: {source["url"]} \n'
+            for chunk in config.last_rag_chunks[:max(30, len(config.last_rag_chunks))]:
+                sources_msg += f'________________________________________\n'
+                sources_msg += f'####{chunk["metadata"]["document_name"]}\n'
+                sources_msg += f'{chunk["content"]}\n'
         else:
             sources_msg = "Aucune source trouvée, veuillez me poser une question d'abord."
     except Exception:
@@ -404,6 +420,91 @@ async def albert_document(ep: EventParser, matrix_client: MatrixClient):
         )
         await matrix_client.send_markdown_message(ep.room.room_id, response)
 
+@register_feature(
+    group="albert",
+    onEvent=RoomMessageText,
+    command="collections",
+    help=AlbertMsg.shorts["collections"],
+)
+@only_allowed_user
+async def albert_collection(ep: EventParser, matrix_client: MatrixClient):
+    """Handle the `collections` command for the sender's RAG collections.
+
+    Supported sub-commands (second word of the command):
+
+    - ``list``: report the names stored in ``config.albert_collections_by_id``.
+    - ``use <name>``: add the collection returned by
+      ``get_or_create_collection_with_name``; when ``<name>`` equals
+      ``config.albert_all_public_command``, add every collection returned by
+      ``get_all_public_collections`` instead.
+    - ``unuse <name>``: empty ``config.albert_collections_by_id``.
+
+    Any other sub-command is rejected with a usage notice and leaves the
+    configuration untouched. The outcome is always sent to the room as an
+    ``m.notice`` markdown message.
+
+    Args:
+        ep: Parsed event; provides the sender, the room and the command words.
+        matrix_client: Client used to send the typing indicator and the reply.
+    """
+    config = user_configs[ep.sender]
+    await matrix_client.room_typing(ep.room.room_id)
+    command = ep.get_command()
+
+    # Build the command name from the configured prefix and the registered
+    # command ("collections") so the usage messages show what must be typed.
+    command_name = f"{COMMAND_PREFIX}collections"
+    usage_example = f"\n\nExemple: `{command_name} use ma_collection`"
+    method = command[1] if len(command) > 1 else None
+
+    if method is None:
+        message = f"La commande {command_name} nécessite de donner list/use/unuse puis <nom_de_collection>/{config.albert_all_public_command} :"
+        message += usage_example
+    elif method not in ("list", "use", "unuse"):
+        # Reject unknown sub-commands whatever the number of arguments, so
+        # they can never reach the destructive `unuse` branch.
+        message = f"La commande {command_name} {method} n'est pas reconnue, seul list/use/unuse sont autorisés"
+    elif method != "list" and len(command) <= 2:
+        message = f"La commande {command_name} {method} nécessite de donner en plus COLLECTION_NAME/{config.albert_all_public_command} :"
+        message += usage_example
+    elif method == "list":
+        collection_names = ",".join(c["name"] for c in config.albert_collections_by_id.values())
+        if not collection_names:
+            message = "Vous n'avez pas de collections enregistrées pour le moment qui pourraient m'aider à répondre à vos questions."
+        else:
+            message = f"Les collections {collection_names} sont disponibles pour vos questions."
+    elif method == "use":
+        if command[2] == config.albert_all_public_command:
+            collections = get_all_public_collections(config)
+        else:
+            collections = [get_or_create_collection_with_name(config, command[2])]
+        for collection in collections:
+            config.albert_collections_by_id[collection["id"]] = collection
+        # Names are computed from the collections just added; this variable
+        # was previously read without ever being assigned in this branch.
+        collection_names = ",".join(c["name"] for c in collections)
+        if not collection_names:
+            message = "Aucune collection n'a été trouvée à ajouter."
+        else:
+            message = f"Les collections {collection_names} sont ajoutées à vos collections."
+    else:
+        # method == "unuse"
+        # NOTE(review): the collection name argument is required but ignored;
+        # every collection is removed. Confirm whether per-name removal is wanted.
+        collection_names = ",".join(c["name"] for c in config.albert_collections_by_id.values())
+        config.albert_collections_by_id = {}
+        if not collection_names:
+            message = "Il n'y avait pas de collections à retirer."
+        else:
+            message = f"Les collections {collection_names} sont retirées de vos collections."
+    await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")
+
+@register_feature(
+    group="albert",
+    onEvent=RoomEncryptedFile,
+    help=None
+)
+@only_allowed_user
+async def albert_document(ep: EventParser, matrix_client: MatrixClient):
+    """Load an uploaded encrypted file into the RAG collection tied to the room.
+
+    For a JSON, PDF or DOCX mimetype, the handler switches the sender's config
+    to ``rag`` mode, gets or creates a collection named after the room id,
+    records it in ``config.albert_collections_by_id``, decrypts the file and
+    uploads it to that collection. For any other mimetype the file is not
+    processed. In both cases a notice describing the outcome is sent.
+
+    On any exception the error is logged, the generic failure notice is sent
+    to the room and, when ``config.errors_room_id`` is set, a debug report is
+    sent to that room on a best-effort basis.
+
+    Args:
+        ep: Parsed event; ``ep.event.mimetype`` is read to select the file type.
+        matrix_client: Client used to send the typing indicator and messages.
+    """
+    # NOTE(review): the handler declared just before this one in the file
+    # appears to share the name `albert_document`; confirm the registry does not
+    # key features by function name, otherwise one of them is shadowed.
+    supported_mimetypes = (
+        'application/json',
+        'application/pdf',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    )
+    config = user_configs[ep.sender]
+
+    try:
+        await matrix_client.room_typing(ep.room.room_id)
+        if ep.event.mimetype in supported_mimetypes:
+            config.update_last_activity()
+            config.albert_mode = "rag"
+            # The collection is named after the room id.
+            collection = get_or_create_collection_with_name(config, ep.room.room_id)
+            config.albert_collections_by_id[collection["id"]] = collection
+            file = await get_decrypted_file(ep)
+            upload_file(config, file, collection['id'])
+            response = (
+                f"Votre document a été chargé dans la collection temporaire {collection['id']}. "
+                "Maintenant, si vous discutez avec moi, "
+                "je tiendrai compte de ce document pour répondre. "
+                "Vous pouvez taper '!mode norag' pour faire que la conversation ne tienne plus compte de ce document."
+            )
+        else:
+            response = (
+                f"J'ai détecté que vous avez téléchargé un fichier {ep.event.mimetype}. "
+                "Ce fichier n'est pas pris en charge par Albert. "
+                "Veuillez téléverser un fichier PDF, DOCX ou JSON."
+            )
+        await matrix_client.send_markdown_message(ep.room.room_id, response, msgtype="m.notice")
+
+    except Exception as albert_err:
+        logger.error(f"{albert_err}")
+        traceback.print_exc()
+        await matrix_client.send_markdown_message(ep.room.room_id, AlbertMsg.failed, msgtype="m.notice")
+        if config.errors_room_id:
+            try:
+                await matrix_client.send_markdown_message(config.errors_room_id, AlbertMsg.error_debug(albert_err, config))
+            except Exception:
+                # Best effort only: reporting to the error room must never mask
+                # the original failure, but it must not swallow
+                # KeyboardInterrupt/SystemExit either.
+                logger.error("Failed to find error room ?!")
+
 @register_feature(
     group="albert",
     onEvent=RoomMessageText,
@@ -429,6 +530,8 @@ async def albert_answer(ep: EventParser, matrix_client: MatrixClient):
         await matrix_client.send_markdown_message(
             ep.room.room_id, reset_message, msgtype="m.notice"
         )
+        delete_collections_with_name(config, ep.room.room_id)
+        config.albert_collections_by_id = {}
 
     config.update_last_activity()
     await matrix_client.room_typing(ep.room.room_id)

diff --git a/app/config.py b/app/config.py
@@ -55,6 +55,7 @@ class Config(BaseConfig):
     # ============================
     # PER USER SETTINGS !
     # ============================
+    albert_collections_by_id: dict[str, dict] = Field({}, description="Collections to use for Albert API chat completion with RAG")
     albert_model: str = Field(
         "AgentPublic/albertlight-7b",
         description="Albert model name to use (see Albert models hub on HuggingFace)",
@@ -64,10 +65,11 @@ class Config(BaseConfig):
     albert_with_history: bool = Field(True, description="Conversational mode")
     albert_history_lookup: int = Field(0, description="How far we lookup in the history")
     albert_max_rewind: int = Field(20, description="Max history rewind for stability purposes")
+    albert_all_public_command: str = Field("__all_public__", description="Command to use to get all public collections")
     conversation_obsolescence: int = Field(
         15 * 60, description="time after which a conversation is considered obsolete, in seconds"
     )
-    last_rag_sources: list[dict] | None = Field(None, description="Last sources used for the RAG.")
+    last_rag_chunks: list[dict] | None = Field(None, description="Last chunks used for the RAG.")
 
     @property
     def is_conversation_obsolete(self) -> bool: