# rag_llama_index_tulu_app.py
# Minimalistic chatbot app example combining LLMs and Retrieval-Augmented Generation.
# The idea is to have a generative language model answer questions exclusively on the basis
# of the provided source material as opposed to relying on previous knowledge or hallucination.
# MORE INFO
# https://github.com/mirix/retrieval-augmented-generation
import os
n_cores = os.cpu_count()
os.environ['OMP_NUM_THREADS'] = str(n_cores)
os.environ['MKL_NUM_THREADS'] = str(n_cores)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# I am explicitly using two GPUs
# You can change this to 0 if you only have one, etc.
# You can also comment this out and check how to use device='auto'
# to autodetect the hardware configuration
os.environ["CUDA_VISIBLE_DEVICES"]="1,0"
### REQUIREMENTS ###
# Flask, PyMuPDF, unidecode, transformers, ctransformers, llama_index
# You may want to install a version of PyTorch that matches your requirements
# (as opposed to the one pulled in by transformers).
# Check the error messages for additional dependencies
from flask import Flask, render_template, request, Response
import fitz
from unidecode import unidecode
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from llama_index import (Document,
                         ServiceContext,
                         VectorStoreIndex,
                         set_global_service_context,
                         SimpleDirectoryReader)
from llama_index.llms import HuggingFaceLLM
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.prompts import PromptTemplate
### APP ###
app = Flask(__name__)
app.static_folder = 'static'
@app.route('/')
def home():
    return render_template('index.html')
### MODEL ###
# We are using Tulu v2 7B DPO.
# We are using GGUF quantisation because it enables us to use the CPU and decide how many layers we want to offload to the GPU.
# For GPU-only inference, AWQ-quantised models will likely result in better performance (check what TheBloke is offering on Hugging Face).
# With the current model, 8 GB of VRAM will probably suffice if the context is not too long. You can switch to the Q4 variant if you want to save VRAM.
# Downloading the models may take a while (several GB). You may prefer to download them beforehand and point to the local folders instead (see the commented-out example below).
name = 'allenai/tulu-2-dpo-7b'
model_name = 'TheBloke/tulu-2-dpo-7B-GGUF'
model_file = 'tulu-2-dpo-7b.Q5_K_M.gguf'
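# For example, if you have downloaded the files beforehand (hypothetical local paths):
# model_name = '/path/to/models/tulu-2-dpo-7B-GGUF'
# model_file = 'tulu-2-dpo-7b.Q5_K_M.gguf'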
# Set gpu_layers according to your system. In this case, my setup does not use more than 35; 0 means no GPU (CPU-only).
# Llama 2 models, such as Tulu, can handle a maximum of 4096 tokens by default, but there are models with longer context windows out there.
model = AutoModelForCausalLM.from_pretrained(model_name, model_file=model_file, context_length=4096, gpu_layers=512, seed=0, hf=True)
tokenizer = AutoTokenizer.from_pretrained(name, legacy=False)
# Current models are very sensitive to the wording of the system prompt (and the prompt in general).
# Finding the right wording may take some trial and error. Some call this "prompt engineering".
# I call it dumb models wasting my time.
system_prompt = 'Please, check if the answer can be inferred from the pieces of context provided. If the answer cannot be inferred from the context, just state that the question is out of scope and do not provide any answer.'
prompt_template = '<|system|>\n' + system_prompt + '</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n'
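# For illustration, the final prompt sent to the model would look roughly like this:
# <|system|>
# Please, check if the answer can be inferred from the pieces of context provided. (...)</s>
# <|user|>
# (the retrieved context and the user question end up here)</s>
# <|assistant|>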
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    query_wrapper_prompt=PromptTemplate(prompt_template),
    context_window=3072,
    max_new_tokens=512,
    tokenizer_kwargs={'max_length': 4096, 'legacy': False},
    # Set n_gpu_layers according to your system; 0 means no GPU (CPU-only).
    # If hyperthreading is not activated on your system, you can remove the //2.
    # You can set the seed for reproducibility.
    model_kwargs={'n_gpu_layers': 512, 'n_threads': n_cores//2, 'seed': 0},
    # Here you can play with temperature, top_p, etc.
    # This will tune how creative the response can get.
    # The higher the values, the more creative (and therefore hallucination-prone) the output.
    # In this case, we prefer a very conservative response.
    generate_kwargs={'do_sample': True, 'temperature': 0.0000001, 'top_p': 0.0000001, 'top_k': 1},
)
# The embedding model can be decisive for the quality of the retrieval phase.
# If you have issues, you may want to switch to a different one.
embed_model = HuggingFaceEmbedding(model_name='thenlper/gte-large')
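# Commented-out example of an alternative embedding model (just one of many options on Hugging Face):
# embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5')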
### CONTEXT ###
# The chunk size needs to be tuned.
# The longer the chunks, the more resources the app will use.
# Rule of thumb:
# Smaller chunk size -> More information will be retrieved.
# Longer chunk size -> More connections between distant pieces of information (if your model is good enough).
# Here we are using small models, so we go for the smallest possible size
# (a commented-out variant with chunk overlap follows the service context below).
service_context = ServiceContext.from_defaults(
    chunk_size=200,
    llm=llm,
    embed_model=embed_model
)
set_global_service_context(service_context)
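# Commented-out variant: if retrieval misses context that spans chunk boundaries,
# you could also experiment with an overlap between consecutive chunks:
# service_context = ServiceContext.from_defaults(chunk_size=200, chunk_overlap=20, llm=llm, embed_model=embed_model)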
# DOCUMENTS
# The quality of the extracted text formatting is also crucial for the retrieval phase.
# You need to make sure you do this right, as inconsistent formatting may result in incomplete retrieval.
# For testing purposes, we are using a single PDF file.
pdf_path = 'pdf/The_Little_Prince_Antoine_de_Saint_Exupery.pdf'
# We use PyMuPDF to extract the text from the PDF
doc = fitz.open(pdf_path)
# We extract blocks (roughly speaking, paragraphs).
# Blocks are identified primarily from geometric information.
paragraphs = []
for page in doc:
    blocks = page.get_text('blocks')
    for block in blocks:
        paragraph = unidecode(block[4]).replace('\n', ' ').strip()
        # We skip empty blocks and remove info about images. You will need to adapt this.
        if paragraph and not paragraph.startswith('<image: ') and not paragraph.endswith('.jpg'):
            paragraphs.append(paragraph)
# PyMuPDF does the extraction page by page.
# We try to merge truncated paragraphs.
# The criteria are the stop tokens below.
# You can resort to an NLP library rather than using an ad hoc list
# (a commented-out sketch follows the merging loop below).
# These are, obviously, language-dependent.
stops = ['.', '!', '?', ':', '"']
merged_paragraphs = paragraphs[:1]
for p in paragraphs[1:]:
    if merged_paragraphs[-1][-1] not in stops:
        merged_paragraphs[-1] = merged_paragraphs[-1] + ' ' + p
    else:
        merged_paragraphs.append(p)
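# A commented-out sketch of the same check done with an NLP library instead of the ad hoc stop list.
# It assumes nltk is installed and its pre-trained 'punkt' data has been downloaded (nltk.download('punkt')):
# import nltk
# punkt = nltk.data.load('tokenizers/punkt/english.pickle')
# def is_truncated(prev, nxt):
#     joined = prev.rstrip() + ' ' + nxt.lstrip()
#     # If no detected sentence ends at the junction between the two paragraphs,
#     # 'prev' most likely stops mid-sentence and should be merged with 'nxt'.
#     return not any(abs(end - len(prev.rstrip())) <= 1 for _, end in punkt.span_tokenize(joined))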
text = '\n\n'.join(merged_paragraphs)
documents = [Document(text=text)]
print('Number of documents:', len(documents))
# VECTOR INDEX
index = VectorStoreIndex.from_documents(documents)
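# Building the index re-embeds the whole document on every start.
# It could optionally be persisted to disk and reloaded on later runs
# (commented-out sketch; 'storage' is an arbitrary directory name):
# index.storage_context.persist(persist_dir='storage')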
### QUERY ###
query_engine = index.as_query_engine()
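# The retrieval step can be tuned too, e.g. the number of chunks passed to the LLM
# (commented-out sketch):
# query_engine = index.as_query_engine(similarity_top_k=5)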
### APP FUNCTION ###
@app.route('/get')
def get_bot_response():
    prompt = request.args.get('msg', '')
    response = str(query_engine.query(prompt))
    # If the generation was cut off mid-sentence, drop the trailing fragment.
    if response and response[-1] != '.':
        response = '.'.join(response.split('.')[:-1]) + '.'
    return response
# Open the URLs indicated in the console output in your browser.
# You can choose another port.
# For instance, the following will work locally:
# http://localhost:5000
# http://127.0.0.1:5000
# If your server is accessible on a local network or the internet, you can do the following from a remote client:
# http://HOST_NAME:5000
# http://HOST_IP:5000
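# Example request from the command line (illustrative query, assuming the server runs locally):
# curl 'http://localhost:5000/get?msg=Who%20does%20the%20Little%20Prince%20meet%20on%20Earth%3F'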
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=False)