implementando o chatweaviate

politicahacker · Oct 5, 2023 · cb0d36c · cb0d36c
1 parent d0c3c0b
commit cb0d36c
Show file tree

Hide file tree

Showing 4 changed files with 237 additions and 4 deletions.
diff --git a/app/agent/lex_chatweaviate.py b/app/agent/lex_chatweaviate.py
@@ -0,0 +1,80 @@
+import os
+
+#LLM
+from langchain import OpenAI
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import SystemMessage
+from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
+#Memory
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import LLMChain
+
+#CallBack
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+# Weaviate-backed document library used to fetch source excerpts for the prompt
+from tools.uploadLib import Library
+
+# Module-level Library instance, constructed when this module is imported from
+# the WEAVIATE_URL and WEAVIATE_API_KEY environment variables.
+library = Library(os.getenv('WEAVIATE_URL'), os.getenv('WEAVIATE_API_KEY'))
+
+from langchain.schema.messages import HumanMessage
+
+class DynamicLibraryPromptTemplate(HumanMessagePromptTemplate):
+    """Human-message template that embeds library excerpts next to the question.
+
+    Each ``format`` call sends the user's question to the module-level
+    ``library`` and places the returned document chunks in the prompt text.
+    """
+
+    def validate_input_variables(cls, v):
+        # Placeholder hook: the input variables are returned unchanged.
+        # NOTE(review): no validator/classmethod decorator is applied here, so
+        # this is a plain method whose first argument is the instance - confirm
+        # whether it was meant to be registered as a validator.
+        return v
+
+    def format(self, **kwargs) -> HumanMessage:
+        """Build the human message for one turn.
+
+        Args:
+            **kwargs: template variables; ``human_input`` holds the question.
+
+        Returns:
+            A ``HumanMessage`` whose content is the retrieved excerpts followed
+            by the answering instructions and the question itself.
+        """
+        # Pull the question out of the template variables.
+        human_input = kwargs.get("human_input")
+
+        # Look up library excerpts related to the question.
+        sources = library.get_sources(human_input)
+
+        # Render the excerpts so they can be embedded in the prompt.
+        formatted_sources = self.format_sources(sources)
+
+        # Assemble the final prompt text.
+        text = f"Esses são os trechos de documentos da nossa biblioteca.:\n{formatted_sources}\nSempre que citar Atenção:\n1) Inclua o nome dos documentos e número da página utilizados na resposta com o formato ('nomedodocumento', 'numero_pg')\n\nAnonimize todas as referências a nomes de pessoas ou marcas.\nResponda da melhor maneira possível a seguinte \n\npergunta:{human_input}"
+        return HumanMessage(content=text, additional_kwargs=self.additional_kwargs)
+
+    def format_sources(self, data):
+        """Render a Weaviate ``Get`` response as a text block of excerpts.
+
+        Args:
+            data: response dict; documents are read from
+                ``data['data']['Get']['Document']`` and each one must provide
+                ``content``, ``fileName`` and ``pageOrChunk``.
+
+        Returns:
+            One header line, the content and a separator line per document,
+            concatenated; an empty string when the document list is empty.
+
+        Raises:
+            ValueError: if the response carries no ``Document`` list (for
+                example an error-only response), instead of failing with an
+                opaque ``KeyError``/``TypeError``.
+        """
+        get_section = (data.get('data') or {}).get('Get') or {}
+        documents = get_section.get('Document')
+        if documents is None:
+            raise ValueError(
+                f"Weaviate query returned no 'Document' results: {data.get('errors')}"
+            )
+
+        return "".join(
+            f"### Fonte: {document['fileName']}, Página: {document['pageOrChunk']} ###\n"
+            f"{document['content']}\n"
+            f"{'='*50}\n"
+            for document in documents
+        )
+
+    def _prompt_type(self):
+        return "dynamic-library"
+
+
+#Prompts
+from .prompts import SYS_PROMPT
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+
+# Chat model, built once. The original code created a ChatOpenAI with
+# model_name="gpt-3.5-turbo" and then replaced it with a streaming instance
+# that did not name a model; both intents are combined here.
+llm = ChatOpenAI(
+    model_name="gpt-3.5-turbo",
+    streaming=True,
+    callbacks=[StreamingStdOutCallbackHandler()],
+)
+
+prompt = ChatPromptTemplate.from_messages([
+    SystemMessage(content="Você é a Assistente Digital da Talk, uma espécie de oraculo digital que tem acesso a todos os documentos já produzidos pela empresa. A Talk é uma empresa de pesquisa com uma metodologia bastante focada em pesquisas qualitativas, buscando identificar e encontrar usuários chaves no tema pesquisado e fazendo anáise em profundidade. Para cada pergunta do usuário, você receberá até 3 respostas do banco de dados para formular suas considerações. Traga insights e provocações relevantes sempre após uma análise."), # The persistent system prompt
+    MessagesPlaceholder(variable_name="chat_history"), # Where the memory will be stored.
+    DynamicLibraryPromptTemplate.from_template("{human_input}"), # Where the human input will injected
+])
+
+# Conversation memory exposed to the prompt under the "chat_history" key.
+memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+chat_llm_chain = LLMChain(
+    llm=llm,
+    prompt=prompt,
+    verbose=True,
+    memory=memory,
+)
diff --git a/app/app.py b/app/app.py
@@ -8,9 +8,10 @@
 import whisper
 from utils import MODEL_DIRECTORY
 
-transcriber = whisper.load_model("medium", download_root=MODEL_DIRECTORY)
+# Whisper model loading is commented out; with transcriber set to None the
+# 'audioMessage' handler replies that transcription is switched off.
+#transcriber = whisper.load_model("medium", download_root=MODEL_DIRECTORY)
+transcriber =  None
 
-ACTIVE_AGENTS = ["lex_chatgpt"]#, "lex_llama"]
+ACTIVE_AGENTS = ["lex_chatweaviate"]#, "lex_llama"]
 script_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(script_dir)
 
@@ -85,7 +86,10 @@ def handle_message(message):
 @socketio.on('audioMessage')
 def handle_audioMessage(audio_blob):
+    """Handle an incoming audio blob from a client.
+
+    When a transcriber is loaded, the work is handed to ``audio_task`` in a
+    background task; otherwise the sender is told that transcription is off.
+    """
     room=request.sid
-    socketio.start_background_task(audio_task, audio_blob, room)
+    if transcriber:
+        socketio.start_background_task(audio_task, audio_blob, room)
+    else:
+        # Target the requesting client only: socketio.emit() without a room
+        # is delivered to every connected client.
+        socketio.emit('message', {'result' : 'Transcriber esta desligado no momento.'}, room=room)
 
 def audio_task(audio_blob, room):    # Carregar modelo e transcrever o áudio
     # Salvar o blob de áudio como um arquivo temporário

diff --git a/app/templates/index.html b/app/templates/index.html
@@ -54,7 +54,7 @@ <h1>Lex.AI</h1>
         // Carregando histórico anterior (se houver)
         var chatHistory = JSON.parse(localStorage.getItem('chatHistory')) || [];
         chatHistory.forEach(function(entry) {
-            addMessage(entry.message, entry.classe);
+            // History replay is disabled: stored entries are read from
+            // localStorage but not rendered, so this loop currently does nothing.
+            //addMessage(entry.message, entry.classe);
         });
 
         function addMessage(message, classe) {

diff --git a/app/tools/uploadLib.py b/app/tools/uploadLib.py
@@ -0,0 +1,149 @@
+import os
+import argparse
+from tqdm import tqdm
+from typing import List
+from unstructured.partition.auto import partition
+from unstructured.chunking.title import chunk_by_title
+from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs
+import weaviate
+import logging
+
+# Configure the root logger at INFO level and create this module's logger.
+# NOTE(review): with the level set to INFO here, the console.debug(...) calls
+# in this module are not emitted unless logging is reconfigured elsewhere.
+logging.basicConfig(level=logging.INFO)
+console = logging.getLogger(__name__)
+
+class Library:
+    """Query-side access to the ``Document`` class stored in Weaviate."""
+
+    def __init__(self, weaviate_url, weaviate_api_key):
+        """Create the Weaviate client.
+
+        Args:
+            weaviate_url: URL of the Weaviate instance.
+            weaviate_api_key: API key used to authenticate against it.
+
+        The OpenAI key is read from the ``OPENAI_API_KEY`` environment
+        variable and forwarded in the ``X-OpenAI-Api-Key`` header.
+        """
+        auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)
+        self.client = weaviate.Client(
+            url=weaviate_url,
+            auth_client_secret=auth_config,
+            additional_headers={"X-OpenAI-Api-Key": os.getenv('OPENAI_API_KEY')}
+        )
+
+    def get_sources(self, question, limit=3):
+        """Run a ``nearText`` search over ``Document`` for *question*.
+
+        Args:
+            question: a concept string, or a list of concept strings.
+            limit: maximum number of documents to return (defaults to 3,
+                the value that was previously hard-coded).
+
+        Returns:
+            The raw response from the Weaviate client, requesting the
+            ``content``, ``pageOrChunk`` and ``fileName`` properties.
+        """
+        # nearText concepts are a list of strings; wrap a bare question.
+        concepts = [question] if isinstance(question, str) else question
+        near_text = {"concepts": concepts}
+
+        return (
+            self.client.query
+            .get("Document", ["content", "pageOrChunk", "fileName"])
+            .with_near_text(near_text)
+            .with_limit(limit)
+            .do()
+        )
+
+
+class WeaviateUploader:
+    """Partition local files and store their chunks as Weaviate objects."""
+
+    def __init__(self, weaviate_url, weaviate_api_key):
+        """Create the Weaviate client and make sure the ``Document`` class exists.
+
+        Args:
+            weaviate_url: URL of the Weaviate instance.
+            weaviate_api_key: API key used to authenticate against it.
+        """
+        auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)
+        self.client = weaviate.Client(
+            url=weaviate_url,
+            auth_client_secret=auth_config,
+            additional_headers={"X-OpenAI-Api-Key": os.getenv('OPENAI_API_KEY')}
+        )
+        if not self.client.schema.exists("Document"):
+            self.create_schema()
+
+    def create_schema(self):
+        """Create the ``Document`` class (fileName, pageOrChunk, content)."""
+        document_schema = {
+            "class": "Document",
+            "description": "A collection of documents",
+            "vectorizer": "text2vec-openai",
+            "properties": [
+                {
+                    "name": "fileName",
+                    "description": "Name of the file",
+                    "dataType": ["string"]
+                },
+                {
+                    "name": "pageOrChunk",
+                    "description": "Page or chunk of the document",
+                    "dataType": ["number"]
+                },
+                {
+                    "name": "content",
+                    "description": "Content of the document",
+                    "dataType": ["text"]
+                }
+            ],
+            "moduleConfig": {
+                "text2vec-openai": {
+                    "vectorizeClassName": True
+                }
+            }
+        }
+
+        self.client.schema.create_class(document_schema)
+
+    def check_existing_file(self, filename):
+        """Return True when at least one ``Document`` has this ``fileName``.
+
+        Raises:
+            RuntimeError: if the response reports errors and carries no
+                ``Get`` data, rather than failing with a bare ``KeyError``.
+        """
+        console.debug(f"Verificando se o arquivo {filename} já está indexado...")
+        # NOTE(review): fileName is declared with dataType "string" in
+        # create_schema while this filter uses "valueText" - confirm that the
+        # target Weaviate version accepts this combination.
+        response = self.client.query.get("Document", ["fileName"]).with_where({
+            "path": ["fileName"],
+            "operator": "Equal",
+            "valueText": filename
+        }).with_limit(1).do()
+        if not response:
+            return False
+
+        get_section = (response.get("data") or {}).get("Get") or {}
+        if not get_section and response.get("errors"):
+            raise RuntimeError(
+                f"Weaviate query for {filename} failed: {response['errors']}"
+            )
+        return bool(get_section.get("Document"))
+
+    def upload_file(self, file_path, index_name):
+        """Partition *file_path*, chunk it by title and upload every chunk.
+
+        Files that are already indexed (per ``check_existing_file``) are
+        skipped. Chunks without a page number use their position in the
+        chunk list as ``pageOrChunk``.
+
+        Args:
+            file_path: path of the file to index; also stored as ``fileName``.
+            index_name: Weaviate class the objects are created in.
+        """
+        if self.check_existing_file(file_path):
+            return None
+        console.debug(f"Particionando {file_path}...")
+        documents = partition(filename=file_path, include_page_breaks=True)
+        chunks = chunk_by_title(documents)
+        weaviate_objects = []
+        for index, doc in enumerate(chunks):
+            # The cleaners return new strings; their results were previously
+            # discarded, so the stored content was never actually cleaned.
+            content = clean_extra_whitespace(str(doc))
+            content = group_broken_paragraphs(content)
+            pg = doc.metadata.page_number or index
+
+            if content:
+                weaviate_objects.append({
+                    "fileName": file_path,
+                    "pageOrChunk": pg,
+                    "content": content
+                })
+
+        console.info(f"Subindo {file_path} no Weaviate...")
+        for obj in tqdm(weaviate_objects):
+            self.client.data_object.create(obj, index_name)
+
+    def iterate_directory_and_upload(self, directory_path, index_name, allowed_filetypes):
+        """Walk *directory_path* recursively and upload every allowed file.
+
+        Args:
+            directory_path: root directory to scan.
+            index_name: Weaviate class passed on to ``upload_file``.
+            allowed_filetypes: extensions to accept, e.g. ``['pdf', 'txt']``;
+                matching ignores case and a leading dot.
+        """
+        allowed = {file_type.lower().lstrip('.') for file_type in allowed_filetypes}
+        for root, _dirs, files in os.walk(directory_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                # splitext looks only at the file name, so dots in directory
+                # names and extension-less files no longer confuse detection.
+                file_type = os.path.splitext(file)[1].lstrip('.').lower()
+
+                if file_type in allowed:
+                    self.upload_file(file_path, index_name)
+                else:
+                    console.debug(f"Tipo de arquivo {file_type} não é permitido. Ignorando {file}.")
+                else:
+                    console.debug(f"Tipo de arquivo {file_type} não é permitido. Ignorando {file}.")
+
+if __name__ == '__main__':
+    # Command-line entry point: "upload" indexes a directory into Weaviate,
+    # "query" prints the sources returned for a question.
+    parser = argparse.ArgumentParser(description="Weaviate File Uploader")
+    parser.add_argument("action", choices=['upload', 'query'], help="Action to perform: upload files or query using get_sources.")
+    parser.add_argument("--directory", help="Path to the directory you want to upload files from.")
+    parser.add_argument("--question", help="The question for get_sources, only needed if action is 'query'.")
+    args = parser.parse_args()
+
+    # Validate action-specific options before any client is created.
+    # parser.error prints the usage text and exits with status 2, so misuse
+    # no longer crashes in os.walk(None) or ends with a success exit code.
+    if args.action == "upload" and not args.directory:
+        parser.error("You need to specify a directory using --directory when action is 'upload'.")
+    if args.action == "query" and not args.question:
+        parser.error("You need to specify a question using --question when action is 'query'.")
+
+    WEAVIATE_URL = os.getenv('WEAVIATE_URL')
+    WEAVIATE_API_KEY = os.getenv('WEAVIATE_API_KEY')
+
+    if args.action == "upload":
+        uploader = WeaviateUploader(WEAVIATE_URL, WEAVIATE_API_KEY)
+        uploader.iterate_directory_and_upload(args.directory, "Document", ['pdf', 'txt', 'docx'])
+    elif args.action == "query":
+        library = Library(WEAVIATE_URL, WEAVIATE_API_KEY)
+        sources = library.get_sources(args.question)
+        print(f"Sources for the question: {sources}")