-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbackend_functions.py
211 lines (169 loc) · 9.19 KB
/
backend_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import json
import openai
import tiktoken
import pinecone
import cohere
import ast
import numpy as np
import pandas as pd
from datetime import datetime
import re
import utils
# Service credentials and settings are read from the local ``utils`` module.
cohere_api_key = utils.cohere_api_key
pinecone_api_key = utils.pinecone_api_key
pinecone_environment = utils.pinecone_environment
openai.api_key = utils.openai_api_key
# Initialize Pinecone and connect to the index.
# NOTE(review): the assignment below replaces the environment value read from
# ``utils`` a few lines above, so that configured value is never used --
# confirm which of the two is intended.
pinecone_environment = "us-west4-gcp-free"
pinecone.init(pinecone_api_key, environment=pinecone_environment)
pinecone_index_name = "salman-khan"
index = pinecone.Index(pinecone_index_name)
# Initialize the Cohere client used to embed text.
co = cohere.Client(cohere_api_key)
# Prompts
# Delimiter string interpolated into the prompts below.
delimiter = "####"
# NOTE: captured once, when this module is imported, and interpolated into
# time_prompt; the "today" stated in that prompt is therefore the load time of
# the module, not the time of each request.
current_time = datetime.now()
# System prompt: classify a query as time related ("T") or general ("G").
# The reply is expected to be JSON with a single "category" key.
category_prompt = f"""You will be provided with a query. \
The user query will be delimited with {delimiter} characters.
your task is to classify each query into a category, either time related or general log query. \
Time related queries are those where there is a mention of a time or time period like month, year, day or any date etc. \
Provide your output in json format where \
keys: category.
values: T for Time related query and G for general log query
Examples for time related queries: Where were you last night, how much you spent on grocery this month?, what was the last entry in journal
Examples for general log queries: What I like to have in dinner?, where have I spent most money, what book I read the most"""
# System prompt: extract a start and end date (YYYYMMDD) from a time-bound query.
# The example output is written as real JSON (double quotes, braces escaped as
# {{ }} because this is an f-string) since the reply is parsed with json.loads.
time_prompt = f"""You will be provided with a query. \
The user query will be delimited with {delimiter} characters.
The user will provide a time-bound query with some timeframe in it. for example, today, yesterday, this week, 6 days ago, three months ago, last year, between january and march,first quarter etc. \
Detect the timeframe from the query and produce start and end date. \
Today's date is {current_time}. Use today's date as an end date if no end date is detected in the query. \
Otherwise use end date specified in a user's query. \
The start date is based on user's query. Don't share code just give the output in json format. \
In json the values which are the dates should not contain any '-' and should be in format YYYYMMDD.\
Provide your output in json format for Example: {{"start_date": 20210101, "end_date": 20220108}}"""
# System prompt for answering: genQuery and timeQuery send it ahead of the
# retrieved journal entries and the user's question.
# NOTE(review): the final sentence ("If there is no keyword matching in the
# query") stops mid-thought -- the intended instruction still needs to be
# written.
# NOTE(review): a line ending in a bare backslash is joined to the next line
# with no space in between (e.g. "contain" + "information") -- confirm whether
# a separating space was intended.
answering_prompt = f"""you are a personal assistant for a user. The user has recorded journal entries along with timestamps which may contain\
information about the user's experiences, thoughts, activities, personal reflections, descriptions of events, financial transactions, or any \
relevant details. Each unique journal entry will be divided by a delimitter {delimiter}.\
Your task is to answer a user query using those journal entries as context. You should provide insightful and accurate responses based on the\
information available in the journal entries.
The user wants to retrieve information about a particular event mentioned in their journal entries. The user query is classified into two categories,\
Time related or General Log query. If the category is T, it means the query\
is time related and if it is G, it means it is a general log query.\
For general and time related log queries, provide to the point but accurate response based on the information provided in the context. \
If there is no keyword matching in the query"""
# System prompt used by final_PA to decide whether the input is a question
# about existing records ('Q') or a new journal entry to store ('E').
# NOTE(review): a line ending in a bare backslash is joined to the next line
# with no space in between (e.g. "contain" + "information") -- confirm whether
# a separating space was intended.
log_query_prompt = f"""you are a personal assistant for a user. The user is either going to record journal entries along with timestamps which may contain\
information about the user's experiences, thoughts, activities, personal reflections, descriptions of events, financial transactions, or any \
relevant details. Or the user is going to ask a query regarding user's existing records. \
Each unique journal entry will be divided by a delimitter {delimiter}.\
Your task is to classify whether the given command is a query or a journal entry. \
Mostly user queries are interrogative, or commanding in nature as compared to journal entries. \
Examples for general journal entries: I did not go to pakistan tour, I had a meeting from 2-3pm, I cried yesterday. \
Examples for user queries: What I like to have in dinner?, where have I spent most money, what book I read the most \
Provide your output in string values: 'Q' for query and 'E' for general log entry"""
# ask the OpenAI chat model for a reply
def get_completion_from_messages(messages, model="gpt-3.5-turbo-16k", temperature=0, max_tokens=500):
    """Send a chat conversation to OpenAI and return the first reply's text.

    Args:
        messages: List of ``{'role': ..., 'content': ...}`` chat messages.
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on the length of the reply.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]
# look up the journal entries closest to a general query
def find_similar(query, namespace='test'):
    """Return the text of the stored entries most similar to ``query``.

    The query is embedded with Cohere and used to search the Pinecone index
    for up to 300 matches in ``namespace``.

    Args:
        query: Text to search for.
        namespace: Pinecone namespace to search.

    Returns:
        A list with the ``text`` metadata of every match, in the order the
        index returned them.
    """
    query_embedding = co.embed(texts=[query], model='large', truncate='LEFT').embeddings
    response = index.query(query_embedding, top_k=300, include_metadata=True, namespace=namespace)
    return [match.metadata['text'] for match in response['matches']]
# look up the journal entries closest to a query within a date range
def find_similar_time(query, startDate, endDate, namespace='test'):
    """Return the text of similar entries dated between two bounds.

    Args:
        query: Text to search for.
        startDate: Lower bound (inclusive) for the ``date`` metadata field.
        endDate: Upper bound (inclusive) for the ``date`` metadata field.
        namespace: Pinecone namespace to search.

    Returns:
        A list with the ``text`` metadata of up to 30 matching entries.
    """
    date_filter = {'date': {'$gte': startDate, '$lte': endDate}}
    query_embedding = co.embed(texts=[query], model='large', truncate='LEFT').embeddings
    response = index.query(
        queries=query_embedding,
        top_k=30,
        filter=date_filter,
        include_metadata=True,
        namespace=namespace,
    )
    # A batched query returns one result set per query vector; only one was sent.
    matches = response['results'][0]['matches']
    texts = []
    for match in matches:
        texts.append(match['metadata']['text'])
    return texts
# answer a question that is not tied to a time period
def genQuery(query, category, namespace='test'):
    """Answer a general query using the most similar journal entries.

    The current timestamp is prepended to the query, similar entries are
    fetched with ``find_similar``, and the model is asked to answer with those
    entries as context.

    Args:
        query: The user's question.
        category: Category label for the query; sent as the first system message.
        namespace: Pinecone namespace to search.

    Returns:
        The model's answer text.
    """
    stamped_query = f'{datetime.now()} {query}'
    entries = find_similar(stamped_query, namespace=namespace)
    conversation = [
        {'role': 'system', 'content': category},
        {'role': 'system', 'content': answering_prompt},
        {'role': 'user', 'content': " #### ".join(entries)},
        {'role': 'user', 'content': stamped_query},
    ]
    return get_completion_from_messages(conversation)
# answer a question restricted to a date range
def timeQuery(query, category, startDate, endDate, namespace='test'):
    """Answer a time related query from entries inside a date range.

    The current timestamp is prepended to the query, entries dated between
    ``startDate`` and ``endDate`` are fetched with ``find_similar_time``, and
    the model is asked to answer with those entries as context.

    Args:
        query: The user's question.
        category: Category label for the query; sent as the first system message.
        startDate: Lower bound of the date filter.
        endDate: Upper bound of the date filter.
        namespace: Pinecone namespace to search.

    Returns:
        The model's answer text.
    """
    asked_at = datetime.now()
    query = f'{asked_at} {query}'
    entries = find_similar_time(query, startDate, endDate, namespace=namespace)
    joined_entries = " #### ".join(entries)
    system_messages = [{'role': 'system', 'content': text} for text in (category, answering_prompt)]
    user_messages = [{'role': 'user', 'content': text} for text in (joined_entries, query)]
    return get_completion_from_messages(system_messages + user_messages)
# function for any type of query
def ask_PA(query, namespace='test'):
    """Answer a user question from the stored journal entries.

    The model first classifies the query with ``category_prompt``. When the
    category is "T" (time related) the model is asked again, with
    ``time_prompt``, for a start and end date, and the answer is produced by
    ``timeQuery`` over that range. Any other category is answered by
    ``genQuery``.

    Args:
        query: The user's question.
        namespace: Pinecone namespace to search.

    Returns:
        The answer text returned by ``timeQuery`` or ``genQuery``.

    Raises:
        json.JSONDecodeError: If a model reply is not valid JSON.
        KeyError: If a reply lacks ``category``, ``start_date`` or ``end_date``.
    """
    classification = get_completion_from_messages([
        {'role': 'system', 'content': category_prompt},
        {'role': 'user', 'content': query},
    ])
    category = json.loads(classification)['category']
    if category != "T":
        return genQuery(query, category, namespace=namespace)

    # time_prompt was formatted with the module-level current_time, i.e. the
    # moment the module was imported. Swap that stale timestamp for the real
    # current time so "today", "yesterday", etc. resolve correctly in a
    # long-running process. If the timestamp is not found, replace() is a no-op.
    fresh_time_prompt = time_prompt.replace(str(current_time), str(datetime.now()))
    timeframe_reply = get_completion_from_messages([
        {'role': 'system', 'content': fresh_time_prompt},
        {'role': 'user', 'content': query},
    ])
    # Parse the reply once and read both bounds from the same object.
    timeframe = json.loads(timeframe_reply)
    # Dates are stored as floats in the index metadata, so compare as floats.
    startDate = float(timeframe['start_date'])
    endDate = float(timeframe['end_date'])
    return timeQuery(query, category, startDate, endDate, namespace=namespace)
# function to add new journal entries
def new_entry(entry, namespace='test'):
    """Timestamp a journal entry, embed it, and store it in the index.

    The stored text has the form ``Time: YYYY/MM/DD HH:MM, Entry: <entry>``.
    The metadata also carries the date as a float ``YYYYMMDD`` so entries can
    be filtered by date range.

    Args:
        entry: The journal text supplied by the user.
        namespace: Pinecone namespace to write to.

    Returns:
        The string ``"Entry added successfully"``.
    """
    # Capture the time once so the text and the numeric date always agree.
    now = datetime.now()
    timestamp = now.strftime('%Y/%m/%d %H:%M')
    entry = f'Time: {timestamp}, Entry: {entry}'

    embeddings = co.embed(model='embed-english-v2.0', texts=[entry]).embeddings
    # Only one text was embedded; convert just that vector to plain floats.
    vector = [float(value) for value in embeddings[0]]

    # The next id is the number of vectors already in the namespace; a
    # namespace missing from the stats gets its first entry under id '0'.
    # NOTE(review): count-based ids repeat an existing id (and the upsert then
    # overwrites that entry) if vectors are ever deleted -- confirm that
    # deletions never happen, or switch to a different id scheme.
    index_stats = index.describe_index_stats()
    try:
        vector_id = str(index_stats['namespaces'][namespace]['vector_count'])
    except KeyError:
        vector_id = '0'

    # Numeric YYYYMMDD date taken straight from the timestamp rather than
    # re-parsed out of the formatted text.
    date = float(now.strftime('%Y%m%d'))
    meta = {'date': date, 'text': entry}
    to_upsert = [{'id': vector_id, "values": vector, "metadata": meta}]
    index.upsert(vectors=to_upsert, namespace=namespace)
    return "Entry added successfully"
# final PA
def final_PA(query, namespace = 'test'):
    """Route user input to question answering or to journal logging.

    The model classifies the input with ``log_query_prompt``: 'Q' inputs are
    answered by ``ask_PA`` and 'E' inputs are stored by ``new_entry``.

    Args:
        query: The raw text entered by the user.
        namespace: Pinecone namespace to read from or write to.

    Returns:
        The answer from ``ask_PA``, the confirmation from ``new_entry``, or
        ``'Please enter your input'`` when the classification is unrecognized.
    """
    messages = [{'role': 'system', 'content': log_query_prompt},
                {'role': 'user', 'content': query}]
    response = get_completion_from_messages(messages)
    # The prompt asks for 'Q' or 'E', but the reply may arrive wrapped in
    # quotes or whitespace, end with a period, or be lowercase. Normalize it
    # so those replies are not rejected.
    label = response.strip(" \t\r\n'\"`.").upper()
    if label == 'Q':
        return ask_PA(query, namespace=namespace)
    if label == 'E':
        return new_entry(query, namespace=namespace)
    return 'Please enter your input'
def access_entries(namespace, k=25):
    """Return the most recently added journal entries as a DataFrame.

    Entries are ordered by their integer id, the last ``k`` are kept, and each
    stored text (``Time: <time>, Entry: <text>``) is split into its two parts.

    Args:
        namespace: Pinecone namespace to read from.
        k: Number of most recent entries to return. Zero or a negative number
            yields an empty frame.

    Returns:
        A DataFrame indexed by ``Time`` (sorted descending) with one ``Entry``
        column.
    """
    # Query by the id '0' with a large top_k.
    # NOTE(review): presumably this is meant to pull back every entry in the
    # namespace; at most 10000 matches are requested -- confirm that is enough.
    entries = index.query(namespace=namespace, top_k=10000, id='0', include_metadata=True)
    ordered = sorted(entries['matches'], key=lambda match: int(match['id']))
    # ordered[-0:] would be the whole list, so guard non-positive k explicitly.
    recent = ordered[-k:] if k > 0 else []

    times = []
    texts = []
    for match in recent:
        # partition never raises: a record without the ', Entry: ' separator
        # keeps its whole text in the time column and gets an empty entry.
        time_part, _, entry_text = match['metadata']['text'].partition(', Entry: ')
        times.append(time_part.replace('Time: ', ''))
        texts.append(entry_text)

    frame = pd.DataFrame({'Time': times, 'Entry': texts})
    return frame.set_index('Time').sort_index(ascending=False)