Skip to content

Commit

Permalink
implementando o chatweaviate
Browse files Browse the repository at this point in the history
  • Loading branch information
pmarkun committed Oct 5, 2023
1 parent d0c3c0b commit cb0d36c
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 4 deletions.
80 changes: 80 additions & 0 deletions app/agent/lex_chatweaviate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os

#LLM
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
#Memory
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain

#CallBack
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

#Weaviate Memory
from tools.uploadLib import Library

library = Library(os.getenv('WEAVIATE_URL'), os.getenv('WEAVIATE_API_KEY'))

from langchain.schema.messages import HumanMessage

class DynamicLibraryPromptTemplate(HumanMessagePromptTemplate):
    """Human-message prompt template that augments the user's question with
    document excerpts retrieved from the Weaviate-backed ``library``.

    ``format`` looks up sources for the incoming ``human_input`` and injects
    them, plus citation/anonymization instructions, into the final message.
    """

    def validate_input_variables(cls, v):
        # NOTE(review): not decorated with @classmethod/@validator, so pydantic
        # never invokes this hook as written — kept as-is for compatibility;
        # confirm whether validation was intended here.
        return v

    def format(self, **kwargs) -> HumanMessage:
        """Build the final human message for the chain.

        Note: despite the base-class convention, this returns a
        ``HumanMessage`` (what ``ChatPromptTemplate`` consumes), not a plain
        string — the original ``-> str`` annotation was wrong.
        """
        # Pull the user's question out of the template kwargs.
        human_input = kwargs.get("human_input")

        # Retrieve matching document chunks from the library for this question.
        sources = library.get_sources(human_input)

        # Render the raw query response into a readable text block.
        formatted_sources = self.format_sources(sources)

        # Final prompt: sources first, then citation + anonymization rules,
        # then the question itself. (Prompt text is user-facing Portuguese and
        # is kept byte-for-byte.)
        text = f"Esses são os trechos de documentos da nossa biblioteca.:\n{formatted_sources}\nSempre que citar Atenção:\n1) Inclua o nome dos documentos e número da página utilizados na resposta com o formato ('nomedodocumento', 'numero_pg')\n\nAnonimize todas as referências a nomes de pessoas ou marcas.\nResponda da melhor maneira possível a seguinte \n\npergunta:{human_input}"
        return HumanMessage(content=text, additional_kwargs=self.additional_kwargs)

    def format_sources(self, data):
        """Render a Weaviate GraphQL response into a delimited text block.

        Expects the ``data['data']['Get']['Document']`` shape produced by
        ``Library.get_sources``; raises ``KeyError``/``TypeError`` on an
        error-shaped response.
        """
        formatted_text = ""

        for document in data['data']['Get']['Document']:
            content = document['content']
            file_name = document['fileName']
            page_or_chunk = document['pageOrChunk']

            formatted_text += f"### Fonte: {file_name}, Página: {page_or_chunk} ###\n"
            formatted_text += f"{content}\n"
            formatted_text += f"{'='*50}\n"

        return formatted_text

    def _prompt_type(self):
        # Identifier LangChain uses when serializing this template.
        return "dynamic-library"


# Prompts
from .prompts import SYS_PROMPT
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Chat model used by the chain: gpt-3.5-turbo with token streaming to stdout.
# Bug fix: the original constructed `llm` twice — the second
# ChatOpenAI(streaming=...) silently replaced the first and dropped the
# explicit model_name. One construction keeps the model choice explicit.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

prompt = ChatPromptTemplate.from_messages([
    # The persistent system prompt (persona + answering rules).
    # Typo fix in the user-facing text: "anáise" -> "análise".
    SystemMessage(content="Você é a Assistente Digital da Talk, uma espécie de oraculo digital que tem acesso a todos os documentos já produzidos pela empresa. A Talk é uma empresa de pesquisa com uma metodologia bastante focada em pesquisas qualitativas, buscando identificar e encontrar usuários chaves no tema pesquisado e fazendo análise em profundidade. Para cada pergunta do usuário, você receberá até 3 respostas do banco de dados para formular suas considerações. Traga insights e provocações relevantes sempre após uma análise."),
    # Where the conversation memory is injected.
    MessagesPlaceholder(variable_name="chat_history"),
    # Human input, dynamically augmented with library sources.
    DynamicLibraryPromptTemplate.from_template("{human_input}"),
])

# Conversation memory keyed to the "chat_history" placeholder above.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

chat_llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory,
)
10 changes: 7 additions & 3 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
import whisper
from utils import MODEL_DIRECTORY

# Whisper transcription is disabled for now (saves the model download/load at
# startup); re-enable by restoring the load_model call below.
# Note: the scraped diff also showed the superseded unconditional
# `transcriber = whisper.load_model(...)` line — only the post-change state
# is kept here.
#transcriber = whisper.load_model("medium", download_root=MODEL_DIRECTORY)
transcriber = None

# Agents loaded at startup; the Weaviate-backed chat agent replaces lex_chatgpt.
ACTIVE_AGENTS = ["lex_chatweaviate"]#, "lex_llama"]

# Make sibling modules (agents, tools) importable regardless of the CWD.
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(script_dir)

Expand Down Expand Up @@ -85,7 +86,10 @@ def handle_message(message):
@socketio.on('audioMessage')
def handle_audioMessage(audio_blob):
    """Handle an incoming audio blob from a socket.io client.

    When the whisper model is loaded, transcription runs in a background
    task scoped to the sender's room; otherwise the client is told the
    transcriber is offline. (The stale pre-diff line that started the
    background task unconditionally has been removed.)
    """
    room = request.sid
    if transcriber:
        socketio.start_background_task(audio_task, audio_blob, room)
    else:
        socketio.emit('message', {'result' : 'Transcriber esta desligado no momento.'})

def audio_task(audio_blob, room): # Carregar modelo e transcrever o áudio
# Salvar o blob de áudio como um arquivo temporário
Expand Down
2 changes: 1 addition & 1 deletion app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ <h1>Lex.AI</h1>
// Carregando histórico anterior (se houver)
var chatHistory = JSON.parse(localStorage.getItem('chatHistory')) || [];
chatHistory.forEach(function(entry) {
addMessage(entry.message, entry.classe);
//addMessage(entry.message, entry.classe);
});

function addMessage(message, classe) {
Expand Down
149 changes: 149 additions & 0 deletions app/tools/uploadLib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import os
import argparse
from tqdm import tqdm
from typing import List
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace, group_broken_paragraphs
import weaviate
import logging

logging.basicConfig(level=logging.INFO)
console = logging.getLogger(__name__)

class Library:
    """Read-only client for the 'Document' collection in Weaviate."""

    def __init__(self, weaviate_url, weaviate_api_key):
        """Connect to Weaviate.

        The OpenAI key header is required because the collection uses the
        text2vec-openai vectorizer for nearText queries.
        """
        auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)
        self.client = weaviate.Client(
            url=weaviate_url,
            auth_client_secret=auth_config,
            additional_headers={"X-OpenAI-Api-Key": os.getenv('OPENAI_API_KEY')}
        )

    def get_sources(self, question):
        """Return up to 3 document chunks semantically close to *question*.

        Returns the raw GraphQL response dict; the hits live under
        ``result['data']['Get']['Document']``.
        """
        # Bug fix: Weaviate's nearText operator takes a LIST of concepts;
        # the original passed the bare string, which the client does not
        # normalize. Accept either for backward compatibility.
        near_text = {
            "concepts": [question] if isinstance(question, str) else question,
        }

        result = (self.client.query
                  .get("Document", ["content", "pageOrChunk", "fileName"])
                  .with_near_text(near_text)
                  .with_limit(3)
                  .do())

        return result


class WeaviateUploader:
    """Partitions local documents and uploads them into the Weaviate
    'Document' collection, skipping files that are already indexed."""

    def __init__(self, weaviate_url, weaviate_api_key):
        """Connect to Weaviate and create the 'Document' schema on first use."""
        auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)
        self.client = weaviate.Client(
            url=weaviate_url,
            auth_client_secret=auth_config,
            additional_headers={"X-OpenAI-Api-Key": os.getenv('OPENAI_API_KEY')}
        )
        if not self.client.schema.exists("Document"):
            self.create_schema()

    def create_schema(self):
        """Create the 'Document' class vectorized with text2vec-openai."""
        document_schema = {
            "class": "Document",
            "description": "A collection of documents",
            "vectorizer": "text2vec-openai",
            "properties": [
                {
                    "name": "fileName",
                    "description": "Name of the file",
                    "dataType": ["string"]
                },
                {
                    "name": "pageOrChunk",
                    "description": "Page or chunk of the document",
                    "dataType": ["number"]
                },
                {
                    "name": "content",
                    "description": "Content of the document",
                    "dataType": ["text"]
                }
            ],
            "moduleConfig": {
                "text2vec-openai": {
                    "vectorizeClassName": True
                }
            }
        }

        self.client.schema.create_class(document_schema)

    def check_existing_file(self, filename):
        """Return True if a Document with this fileName is already indexed."""
        # Bug fix: the f-string had lost its placeholder — restore the
        # filename interpolation in the debug message.
        console.debug(f"Verificando se o arquivo {filename} já está indexado...")
        query = self.client.query.get("Document", ["fileName"]).with_where({
            "path": ["fileName"],
            "operator": "Equal",
            "valueText": filename
        }).with_limit(1).do()
        return bool(query and query["data"]["Get"]["Document"])

    def upload_file(self, file_path, index_name):
        """Partition *file_path* into chunks and upload each as an object.

        Skips (returns None) if the file is already indexed. Objects store
        the path as fileName, the page number (or chunk index when the page
        is unknown) as pageOrChunk, and the cleaned text as content.
        """
        if self.check_existing_file(file_path):
            return None
        console.debug(f"Particionando {file_path}...")
        documents = partition(filename=file_path, include_page_breaks=True)
        chunks = chunk_by_title(documents)
        weaviate_objects = []
        for index, doc in enumerate(chunks):
            content = str(doc)
            # Bug fix: these cleaners RETURN the cleaned string; the original
            # discarded the results, so uncleaned text was uploaded.
            content = clean_extra_whitespace(content)
            content = group_broken_paragraphs(content)
            pg = doc.metadata.page_number
            if not pg:
                # Fall back to the chunk index when no page number is known.
                pg = index

            if content:
                obj = {
                    "fileName": file_path,
                    "pageOrChunk": pg,
                    "content": content
                }
                weaviate_objects.append(obj)

        console.info(f"Subindo {file_path} no Weaviate...")
        for obj in tqdm(weaviate_objects):
            self.client.data_object.create(obj, index_name)

    def iterate_directory_and_upload(self, directory_path, index_name, allowed_filetypes):
        """Walk *directory_path* and upload every file whose extension is in
        *allowed_filetypes* (compared case-insensitively)."""
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Case-insensitive extension check so 'report.PDF' is accepted.
                file_type = file_path.split('.')[-1].lower()

                if file_type in allowed_filetypes:
                    self.upload_file(file_path, index_name)
                else:
                    console.debug(f"Tipo de arquivo {file_type} não é permitido. Ignorando {file}.")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Weaviate File Uploader")
    parser.add_argument("action", choices=['upload', 'query'], help="Action to perform: upload files or query using get_sources.")
    parser.add_argument("--directory", help="Path to the directory you want to upload files from.")
    parser.add_argument("--question", help="The question for get_sources, only needed if action is 'query'.")
    args = parser.parse_args()

    WEAVIATE_URL = os.getenv('WEAVIATE_URL')
    WEAVIATE_API_KEY = os.getenv('WEAVIATE_API_KEY')

    if args.action == "upload":
        # Bug fix: mirror the validation the 'query' branch already has —
        # without it, os.walk(None) crashes when --directory is omitted.
        if not args.directory:
            print("You need to specify a directory using --directory when action is 'upload'.")
        else:
            uploader = WeaviateUploader(WEAVIATE_URL, WEAVIATE_API_KEY)
            uploader.iterate_directory_and_upload(args.directory, "Document", ['pdf', 'txt', 'docx'])
    elif args.action == "query":
        if not args.question:
            print("You need to specify a question using --question when action is 'query'.")
        else:
            library = Library(WEAVIATE_URL, WEAVIATE_API_KEY)
            sources = library.get_sources(args.question)
            print(f"Sources for the question: {sources}")

0 comments on commit cb0d36c

Please sign in to comment.