-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path llm_messenger_history.py
144 lines (120 loc) · 5.28 KB
/
llm_messenger_history.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import json
import os
import time
from langchain.document_loaders import FacebookChatLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
import argparse
from datetime import datetime
def main():
    """Index a Facebook Messenger export into a FAISS vector store and
    start an interactive retrieval-augmented chat session.

    Reads OPENAI_API_KEY and USERNAME from a .env file and expects the
    Messenger JSON export under ./data/<USERNAME>/messages/inbox.

    Raises:
        ValueError: if required environment variables are missing, or if
            no message chunks could be produced from the export.
        FileNotFoundError: if the export folder does not exist.
    """
    parser = argparse.ArgumentParser(description='Facebook Messenger Chat Analysis')
    parser.add_argument('--model',
                        choices=['gpt-3.5-turbo', 'gpt-4'],
                        default='gpt-3.5-turbo',
                        help='Choose the OpenAI model to use (default: gpt-3.5-turbo)')
    args = parser.parse_args()

    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise ValueError("Please set your OpenAI API key in the .env file.")
    username = os.getenv("USERNAME")
    if not username:
        raise ValueError("Please set your USERNAME in the .env file.")

    folder_path = f"./data/{username}/messages/inbox"
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The folder path {folder_path} does not exist.")

    documents = _load_messenger_documents(folder_path)

    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)
    print(f"Split into {len(docs)} chunks.")

    # Guard: with zero chunks the batching loop below never runs, the
    # vector store would stay None, and .as_retriever() would crash with
    # an opaque AttributeError. Fail early with a clear message instead.
    if not docs:
        raise ValueError("No message chunks were produced; nothing to index.")

    print("Creating embeddings and building vector store...")
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vectorstore = _build_vectorstore(docs, embeddings)

    print(f"Setting up conversational AI using {args.model}...")
    llm = ChatOpenAI(
        temperature=0.7,
        model_name=args.model,
        openai_api_key=OPENAI_API_KEY
    )
    qa = ConversationalRetrievalChain.from_llm(
        llm,
        vectorstore.as_retriever(),
        return_source_documents=True
    )
    _chat_loop(qa)


def _load_messenger_documents(folder_path):
    """Walk the export folder and load every .json conversation file.

    Returns a list of LangChain documents, one batch per conversation file.
    """
    print("Loading Facebook Messenger data...")
    documents = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                loader = FacebookChatLoader(path=os.path.join(root, file))
                documents.extend(loader.load())
    print(f"Loaded {len(documents)} documents.")
    return documents


def _build_vectorstore(docs, embeddings):
    """Embed *docs* in rate-limit-paced batches and return the merged FAISS store.

    Batches are spaced by an estimated optimal wait derived from a
    tokens-per-minute budget; on a rate-limit error the wait doubles
    (exponential backoff) and then decays back toward the optimum.
    """
    tokens_per_chunk = 250       # rough token estimate per 1000-char chunk
    rate_limit_tpm = 1000000     # embeddings API tokens-per-minute budget
    batch_size = 100
    total_chunks = len(docs)
    total_tokens = total_chunks * tokens_per_chunk
    tokens_per_batch = batch_size * tokens_per_chunk
    batches_per_minute = rate_limit_tpm / tokens_per_batch
    # 10% safety margin on top of the theoretical minimum spacing.
    optimal_wait_time = (60 / batches_per_minute) * 1.1
    print(f"\nProcessing {total_chunks} chunks (~{total_tokens:,} tokens)")
    print(f"Estimated optimal wait time between batches: {optimal_wait_time:.2f}s")
    print(f"Estimated total processing time: {(total_chunks/batch_size * optimal_wait_time)/60:.1f} minutes\n")

    vectorstore = None
    wait_time = optimal_wait_time
    start_time = datetime.now()
    # Fix: ceil division. The original `len(docs)//batch_size + 1`
    # over-counted by one whenever len(docs) was a multiple of batch_size.
    total_batches = (len(docs) + batch_size - 1) // batch_size
    for i in range(0, len(docs), batch_size):
        end_idx = min(i + batch_size, len(docs))
        batch_num = i // batch_size + 1
        elapsed_time = (datetime.now() - start_time).total_seconds() / 60
        print(f"Batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%) - "
              f"Elapsed: {elapsed_time:.1f}m")
        max_retries = 3
        for attempt in range(max_retries):
            try:
                batch_docs = docs[i:end_idx]
                if vectorstore is None:
                    vectorstore = FAISS.from_documents(batch_docs, embeddings)
                else:
                    vectorstore.merge_from(FAISS.from_documents(batch_docs, embeddings))
                break
            except Exception as e:
                if "Rate limit" in str(e) and attempt < max_retries - 1:
                    wait_time *= 2  # back off hard, then decay toward optimum below
                    print(f"Rate limit hit. Waiting {wait_time:.1f}s before retry...")
                    time.sleep(wait_time)
                    wait_time = max(optimal_wait_time, wait_time * 0.75)
                else:
                    # Fix: bare raise preserves the original traceback
                    # (the original `raise e` re-raised from this frame).
                    raise
        if end_idx < len(docs):
            print(f"Waiting {wait_time:.1f}s...")
            time.sleep(wait_time)
    total_time = (datetime.now() - start_time).total_seconds() / 60
    print(f"\nVector store creation completed in {total_time:.1f} minutes")
    return vectorstore


def _chat_loop(qa):
    """Run the interactive stdin REPL against the retrieval chain *qa*."""
    print("Chat with your Facebook Messenger AI (type 'exit' to quit):")
    chat_history = []
    while True:
        query = input("You: ")
        if query.lower() in ('exit', 'quit'):
            print("Exiting chat.")
            break
        if not query.strip():
            continue
        result = qa({"question": query, "chat_history": chat_history})
        answer = result["answer"]
        print(f"\nAI: {answer}\n")
        if "source_documents" in result:
            # Show up to 3 truncated source snippets for transparency.
            sources = [doc.page_content[:400] for doc in result["source_documents"][:3]]
            sources_json = json.dumps({"sources": sources}, indent=2)
            print(f"\nSources: {sources_json}")
        chat_history.append((query, answer))
# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()