Merge branch 'staging'

etalab-ia · Nov 12, 2024 · 95f0126 · 95f0126
2 parents ea80769 + 364333a
commit 95f0126
Show file tree

Hide file tree

Showing 13 changed files with 624 additions and 74 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: CC0-1.0
 
 # Project-specific
-store
-session.txt
+data/store
+data/session.txt
 
 *.pyc
 __pycache__

diff --git a/README.md b/README.md
@@ -62,6 +62,41 @@ Pour lancer le bot executez :
 python app
 ```
 
+#### NOTE 1
+
+Cette commande s'arrêtera sûrement si vous ne la lancez pas en mode sudo, car
+elle installe par défaut le data/store et le data/session.txt à la racine "/".
+Vous pouvez lancer l'application pour qu'elle crée ces fichiers dans le dossier du projet directement avec la commande :
+
+```bash
+export STORE_PATH='./data/store/' && export SESSION_PATH='./data/session.txt' && python app
+```
+
+#### NOTE 2
+
+Si vous voulez développer tout en faisant en sorte que le bot se recharge automatiquement, vous pouvez utiliser par exemple [nodemon](https://github.com/python-nodemon/nodemon) en module global python et lancer la commande suivante dans un terminal :
+
+```bash
+nodemon --watch app --ext py --exec "export STORE_PATH='./data/store/' && export SESSION_PATH='./data/session.txt' && python app"
+```
+
+#### NOTE 3
+
+Si vous voulez que vos messages engendrés par le bot se distinguent des autres messages, possiblement envoyés par d'autres bots (comme celui de staging) :
+
+```bash
+nodemon --watch app --ext py --exec "export MESSAGE_PREFIX='[DEV]' && export STORE_PATH='./data/store/' && export SESSION_PATH='./data/session.txt' && python app"
+```
+
+#### NOTE 4
+
+Si vous voulez merger votre branche de dev pour la tester sur beta.tchap (branche staging) :
+
+```bash
+git checkout staging
+git merge <your-branch>
+git push origin staging
+```
 
 ### Troubleshooting
 

diff --git a/app/bot_msg.py b/app/bot_msg.py
@@ -13,6 +13,7 @@ class AlbertMsg:
     shorts = {
         "help": f"Pour retrouver ce message informatif, tapez `{COMMAND_PREFIX}aide`. Pour les geek tapez `{COMMAND_PREFIX}aide -v`.",
         "reset": f"Pour ré-initialiser notre conversation, tapez `{COMMAND_PREFIX}reset`",
+        "collections": f"Pour modifier l'ensemble des collections utilisées quand vous me posez une question, tapez `{COMMAND_PREFIX}collections list/use/unuse/info COLLECTION_NAME/{Config().albert_all_public_command}`",
         "conversation": f"Pour activer/désactiver le mode conversation, tapez `{COMMAND_PREFIX}conversation`",
         "debug": f"Pour afficher des informations sur la configuration actuelle, `{COMMAND_PREFIX}debug`",
         "model": f"Pour modifier le modèle, tapez `{COMMAND_PREFIX}model MODEL_NAME`",
@@ -22,6 +23,10 @@ class AlbertMsg:
 
     failed = "🤖 Albert a échoué à répondre. Veuillez réessayez dans un moment."
 
+    flush_start = "Nettoyage des collections RAG propres à cette conversation..."
+
+    flush_end = "Nettoyage des collections RAG terminé."
+
     reset = "**La conversation a été remise à zéro**. Vous pouvez néanmoins toujours répondre dans un fil de discussion."
 
     user_not_allowed = "Albert est en phase de test et n'est pas encore disponible pour votre utilisateur. Contactez [email protected] pour demander un accès."
@@ -52,7 +57,7 @@ def help(model_url, model_short_name, cmds):
 
     def commands(cmds):
         msg = "Les commandes spéciales suivantes sont disponibles :\n\n"
-        msg += "- " + "\n- ".join(cmds) # type: ignore
+        msg += "- " + "\n- ".join(cmds)  # type: ignore
         return msg
 
     def unknown_command(cmds_msg):

diff --git a/app/commands.py b/app/commands.py
@@ -12,19 +12,29 @@
 from matrix_bot.client import MatrixClient
 from matrix_bot.config import logger
 from matrix_bot.eventparser import EventNotConcerned, EventParser
-from nio import Event, RoomMemberEvent, RoomMessageText
+from nio import Event, RoomEncryptedFile, RoomMemberEvent, RoomMessageText
 
 from bot_msg import AlbertMsg
 from config import COMMAND_PREFIX, Config
 from core_llm import (
+    flush_collections_with_name,
+    get_all_public_collections,
+    get_or_create_collection_with_name,
+    get_or_not_collection_with_name,
+    get_documents,
     generate,
-    generate_sources,
     get_available_models,
     get_available_modes,
+    upload_file,
 )
 from iam import TchapIam
-from tchap_utils import get_cleanup_body, get_previous_messages, get_thread_messages, isa_reply_to
-
+from tchap_utils import (
+    get_cleanup_body, 
+    get_decrypted_file,
+    get_previous_messages, 
+    get_thread_messages, 
+    isa_reply_to
+)
 
 @dataclass
 class CommandRegistry:
@@ -163,15 +173,17 @@ async def wrapper(ep: EventParser, matrix_client: MatrixClient):
 
         config = user_configs[ep.sender]
         is_allowed, msg = await tiam.is_user_allowed(config, ep.sender, refresh=True)
-        if is_allowed:
-            return await func(ep, matrix_client)
+        if not is_allowed:
+            if not msg or ep.is_command(COMMAND_PREFIX):
+                # Only send back the message for the generic albert_answer method
+                # ignoring other callbacks.
+                raise EventNotConcerned
 
-        if not msg or ep.is_command(COMMAND_PREFIX):
-            # Only send back the message for the generic albert_answer method
-            # ignoring other callbacks.
-            raise EventNotConcerned
+            await log_not_allowed(msg, ep, matrix_client)
+            return
 
-        await log_not_allowed(msg, ep, matrix_client)
+        await func(ep, matrix_client)
+        await matrix_client.room_typing(ep.room.room_id, typing_state=False)
 
     return wrapper
 
@@ -239,6 +251,15 @@ async def albert_reset(ep: EventParser, matrix_client: MatrixClient):
         await matrix_client.send_markdown_message(
             ep.room.room_id, reset_message, msgtype="m.notice"
         )
+
+        message = AlbertMsg.flush_start
+        await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")  
+        await matrix_client.room_typing(ep.room.room_id)
+        flush_collections_with_name(config, ep.room.room_id)
+        config.albert_collections_by_id = {}
+        message = AlbertMsg.flush_end
+        await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")  
+
     else:
         await matrix_client.send_markdown_message(
             ep.room.room_id,
@@ -296,8 +317,7 @@ async def albert_model(ep: EventParser, matrix_client: MatrixClient):
     await matrix_client.room_typing(ep.room.room_id)
     command = ep.get_command()
     # Get all available models
-    all_models = get_available_models(config)
-    all_models = [k for k, v in all_models.items() if v["type"] == "text-generation"]
+    all_models = list(get_available_models(config))
     models_list = "\n\n- " + "\n- ".join(
         map(lambda x: x + (" *" if x == config.albert_model else ""), all_models)
     )
@@ -327,11 +347,9 @@ async def albert_model(ep: EventParser, matrix_client: MatrixClient):
 @only_allowed_user
 async def albert_mode(ep: EventParser, matrix_client: MatrixClient):
     config = user_configs[ep.sender]
-    await matrix_client.room_typing(ep.room.room_id)
     command = ep.get_command()
     # Get all available mode for the current model
     all_modes = get_available_modes(config)
-    all_modes += ["norag"]
     mode_list = "\n\n- " + "\n- ".join(
         map(lambda x: x + (" *" if x == config.albert_mode else ""), all_modes)
     )
@@ -347,8 +365,18 @@ async def albert_mode(ep: EventParser, matrix_client: MatrixClient):
             old_mode = config.albert_mode
             config.albert_mode = mode
             message = f"Le mode a été modifié : {old_mode} -> {mode}"
+
     await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")
 
+    if mode == "norag":
+        message = AlbertMsg.flush_start
+        await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")  
+        await matrix_client.room_typing(ep.room.room_id)
+        flush_collections_with_name(config, ep.room.room_id)
+        config.albert_collections_by_id = {}
+        message = AlbertMsg.flush_end
+        await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")  
+
 
 @register_feature(
     group="albert",
@@ -361,15 +389,13 @@ async def albert_sources(ep: EventParser, matrix_client: MatrixClient):
     config = user_configs[ep.sender]
 
     try:
-        if config.last_rag_references:
+        if config.last_rag_chunks:
             await matrix_client.room_typing(ep.room.room_id)
-            sources = generate_sources(config, config.last_rag_references)
             sources_msg = ""
-            for source in sources:
-                extra_context = ""
-                if source.get("context"):
-                    extra_context = f'({source["context"]})'
-                sources_msg += f'- {source["title"]} {extra_context}: {source["url"]} \n'
+            for chunk in config.last_rag_chunks[:max(30, len(config.last_rag_chunks))]:
+                sources_msg += f'________________________________________\n'
+                sources_msg += f'####{chunk["metadata"]["document_name"]}\n'
+                sources_msg += f'{chunk["content"]}\n'
         else:
             sources_msg = "Aucune source trouvée, veuillez me poser une question d'abord."
     except Exception:
@@ -380,6 +406,144 @@ async def albert_sources(ep: EventParser, matrix_client: MatrixClient):
     await matrix_client.send_markdown_message(ep.room.room_id, sources_msg)
 
 
+@register_feature(
+    group="albert",
+    onEvent=RoomMessageText,
+    command="collections",
+    help=AlbertMsg.shorts["collections"],
+)
+@only_allowed_user
+async def albert_collection(ep: EventParser, matrix_client: MatrixClient):
+    """Handle the `collections` command (list / info / use / unuse).
+
+    The sub-command is the second token of the parsed command. Every
+    sub-command except `list` also requires a third token: a collection
+    name or `config.albert_all_public_command`. The selected collections
+    are kept in `config.albert_collections_by_id`, and a single notice
+    describing the outcome is sent back to the room.
+    """
+    config = user_configs[ep.sender]
+    await matrix_client.room_typing(ep.room.room_id)
+    command = ep.get_command()
+    method = command[1] if len(command) > 1 else None
+    example = "\n\nExemple: `!collections use decisions-adlc`"
+
+    if method is None:
+        message = f"La commande !collections nécessite de donner list/use/unuse/info puis éventuellement <nom_de_collection>/{config.albert_all_public_command} :"
+        message += example
+    elif method not in ('list', 'info', 'use', 'unuse'):
+        # Reject unknown sub-commands up front, whatever the number of
+        # arguments, so that they can never reach the destructive `unuse` path.
+        message = f"La commande !collections {method} n'est pas reconnue, seul list/use/unuse/info sont autorisés"
+    elif method != 'list' and len(command) <= 2:
+        message = f"La commande !collections {method} nécessite de donner en plus COLLECTION_NAME/{config.albert_all_public_command} :"
+        message += example
+    elif method == 'list':
+        used_collections = list(config.albert_collections_by_id.values())
+        if not used_collections:
+            message = "Vous n'avez pas de collections enregistrées pour le moment qui pourraient m'aider à répondre à vos questions."
+        else:
+            # A collection named after the room id is displayed under the
+            # private-collection alias instead of the raw room id.
+            display_names = [
+                c['name'] if c['name'] != ep.room.room_id else config.albert_my_private_collection_name
+                for c in used_collections
+            ]
+            collection_infos = '\n - ' + '\n - '.join(display_names)
+            message = (
+                "Les collections :\n"
+                f"{collection_infos}\n\n"
+                "sont prises en compte pour m'aider à répondre à vos questions."
+            )
+        public_collections = get_all_public_collections(config)
+        message += "\n\nNotez que les collections publiques à votre disposition sont:\n"
+        message += '\n - ' + '\n - '.join([f"{c['name']}" for c in public_collections])
+        message += f"\n\nVous pouvez toutes les ajouter d'un coup en utilisant la commande `!collections use {config.albert_all_public_command}`"
+    elif method == 'info':
+        # The private-collection alias is looked up under the room id.
+        collection_name = command[2] if command[2] != config.albert_my_private_collection_name else ep.room.room_id
+        collection = get_or_not_collection_with_name(config, collection_name)
+        if not collection:
+            message = f"La collection {collection_name} n'existe pas."
+        else:
+            header = f"Collection '{command[2]}' ({collection['id']}) : \n\n"
+            document_infos = [f"{d['name']} ({d['id']})" for d in get_documents(config, collection['id'])]
+            if not document_infos:
+                message = header + f"Aucun document n'est présent dans cette collection ({collection['id']})."
+            else:
+                message = (
+                    header
+                    + "Voici les documents actuellement présents dans la collection : \n\n"
+                    + '\n - ' + '\n - '.join(document_infos)
+                    + "\n\n"
+                )
+    elif method == 'use':
+        if command[2] == config.albert_all_public_command:
+            collections = get_all_public_collections(config)
+            not_found_message = "Aucune collection publique n'est disponible."
+        else:
+            collection = get_or_not_collection_with_name(config, command[2])
+            collections = [collection] if collection else []
+            not_found_message = f"La collection {command[2]} n'existe pas."
+        if not collections:
+            # Always assign a message: the final send below needs one.
+            message = not_found_message
+        else:
+            collection_names = ','.join([c['name'] for c in collections])
+            for collection in collections:
+                config.albert_collections_by_id[collection["id"]] = collection
+            collection_infos = '\n - ' + '\n - '.join([f"{c['name']}" for c in config.albert_collections_by_id.values()])
+            # Pick the header first: adjacent string literals bind tighter than
+            # a conditional expression, which would drop the listing otherwise.
+            if len(collections) > 1:
+                header = f"Les collections {collection_names} sont ajoutées à vos collections.\n\n"
+            else:
+                header = f"La collection {collection_names} est ajoutée à vos collections.\n\n"
+            message = (
+                header
+                + "Maintenant, les collections :\n"
+                + f"{collection_infos}\n\n"
+                + "sont disponibles pour m'aider à répondre à vos questions."
+            )
+    else:
+        # method == 'unuse'.
+        # NOTE(review): the collection name argument is required but ignored;
+        # every collection currently in use is removed. Confirm this is intended.
+        removed_names = ','.join([c['name'] for c in config.albert_collections_by_id.values()])
+        had_collections = bool(config.albert_collections_by_id)
+        config.albert_collections_by_id = {}
+        if not had_collections:
+            message = "Il n'y avait pas de collections à retirer."
+        else:
+            message = f"Les collections {removed_names} sont retirées de vos collections."
+    await matrix_client.send_markdown_message(ep.room.room_id, message, msgtype="m.notice")
+
+@register_feature(
+    group="albert",
+    onEvent=RoomEncryptedFile,
+    help=None
+)
+@only_allowed_user
+async def albert_document(ep: EventParser, matrix_client: MatrixClient):
+    """Handle an encrypted file sent in the room.
+
+    For a JSON, PDF or DOCX mimetype, the file is decrypted and uploaded to
+    the collection named after the room id, that collection is added to
+    `config.albert_collections_by_id`, the mode is switched to "rag", and a
+    notice listing the documents in the collection is sent. For any other
+    mimetype, a notice explains which formats are accepted. On any error a
+    generic failure notice is sent and, if `config.errors_room_id` is set,
+    a debug message is sent to that room as well.
+    """
+    config = user_configs[ep.sender]
+    supported_mimetypes = (
+        'application/json',
+        'application/pdf',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    )
+
+    try:
+        await matrix_client.room_typing(ep.room.room_id)
+        if ep.event.mimetype in supported_mimetypes:
+            config.update_last_activity()
+            config.albert_mode = "rag"
+            collection = get_or_create_collection_with_name(config, ep.room.room_id)
+            config.albert_collections_by_id[collection['id']] = collection
+            file = await get_decrypted_file(ep)
+            upload_file(config, file, collection['id'])
+            private_document_infos = [d['name'] for d in get_documents(config, collection['id'])]
+            private_document_infos_message = '\n - ' + '\n - '.join(private_document_infos)
+            response = (
+                "Votre document : \n\n"
+                f"\"{file.name}\"\n\n"
+                "a été chargé dans votre collection privée.\n\n"
+                "Voici les documents actuellement présents dans votre collection privée : \n\n"
+                f"{private_document_infos_message}"
+                "\n\n"
+                "Je tiendrai compte de tous ces documents pour répondre. \n\n"
+                "Vous pouvez taper \"!mode norag\" pour vider votre collection privée de tous ces documents."
+            )
+        else:
+            response = (
+                f"J'ai détecté que vous avez téléchargé un fichier {ep.event.mimetype}. "
+                "Ce fichier n'est pas pris en charge par Albert. "
+                "Veuillez téléverser un fichier PDF, DOCX ou JSON."
+            )
+        await matrix_client.send_markdown_message(ep.room.room_id, response, msgtype="m.notice")
+
+    except Exception as albert_err:
+        logger.error(f"{albert_err}")
+        traceback.print_exc()
+        await matrix_client.send_markdown_message(ep.room.room_id, AlbertMsg.failed, msgtype="m.notice")
+        if config.errors_room_id:
+            # Best effort: failing to report to the error room must not mask
+            # the original failure, but it should not swallow
+            # KeyboardInterrupt / task cancellation either.
+            try:
+                await matrix_client.send_markdown_message(config.errors_room_id, AlbertMsg.error_debug(albert_err, config))
+            except Exception as report_err:
+                logger.error(f"Failed to find error room ?! ({report_err})")
+
 @register_feature(
     group="albert",
     onEvent=RoomMessageText,
@@ -405,9 +569,11 @@ async def albert_answer(ep: EventParser, matrix_client: MatrixClient):
         await matrix_client.send_markdown_message(
             ep.room.room_id, reset_message, msgtype="m.notice"
         )
+        flush_collections_with_name(config, ep.room.room_id)
+        config.albert_collections_by_id = {}
 
     config.update_last_activity()
-    await matrix_client.room_typing(ep.room.room_id, typing_state=True, timeout=180_000)
+    await matrix_client.room_typing(ep.room.room_id)
     try:
         # Build the messages  history
         # --

diff --git a/app/config.py b/app/config.py
@@ -55,18 +55,22 @@ class Config(BaseConfig):
     # ============================
     # PER USER SETTINGS !
     # ============================
+    albert_collections_by_id: dict[str, dict] = Field({}, description="Collections to use for Albert API chat completion with RAG")
     albert_model: str = Field(
         "AgentPublic/albertlight-7b",
         description="Albert model name to use (see Albert models hub on HuggingFace)",
     )
+    albert_model_embedding: str = Field("BAAI/bge-m3", description="Embedding model (Rag, COT, etc)")
     albert_mode: str = Field("rag", description="Albert API mode")
     albert_with_history: bool = Field(True, description="Conversational mode")
     albert_history_lookup: int = Field(0, description="How far we lookup in the history")
     albert_max_rewind: int = Field(20, description="Max history rewind for stability purposes")
+    albert_my_private_collection_name: str = Field("ma_collection_privée", description="Name of the private collection for the user")
+    albert_all_public_command: str = Field("<all_public>", description="Command to use to get all public collections")
     conversation_obsolescence: int = Field(
         15 * 60, description="time after which a conversation is considered obsolete, in seconds"
     )
-    last_rag_references: list[dict] | None = Field(None, description="Last sources used for the RAG.")
+    last_rag_chunks: list[dict] | None = Field(None, description="Last chunks used for the RAG.")
 
     @property
     def is_conversation_obsolete(self) -> bool: