forked from PromtEngineer/localGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_localGPT.py
88 lines (74 loc) · 3.17 KB
/
run_localGPT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from langchain.chains import RetrievalQA
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY
from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline
import click
from constants import CHROMA_SETTINGS
def load_model():
'''
Select a model on huggingface.
If you are running this for the first time, it will download a model for you.
subsequent runs will use the model from the disk.
'''
model_id = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(model_id,
# load_in_8bit=True, # set these options if your GPU supports them!
# device_map=1#'auto',
# torch_dtype=torch.float16,
# low_cpu_mem_usage=True
)
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_length=2048,
temperature=0,
top_p=0.95,
repetition_penalty=1.15
)
local_llm = HuggingFacePipeline(pipeline=pipe)
return local_llm
@click.command()
@click.option('--device_type', default='gpu', help='device to run on, select gpu or cpu')
def main(device_type, ):
# load the instructorEmbeddings
if device_type in ['cpu', 'CPU']:
device='cpu'
else:
device='cuda'
print(f"Running on: {device}")
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
model_kwargs={"device": device})
# load the vectorstore
db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
retriever = db.as_retriever()
# Prepare the LLM
# callbacks = [StreamingStdOutCallbackHandler()]
# load the LLM for generating Natural Language responses.
llm = load_model()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
# Interactive questions and answers
while True:
query = input("\nEnter a query: ")
if query == "exit":
break
# Get the answer from the chain
res = qa(query)
answer, docs = res['result'], res['source_documents']
# Print the result
print("\n\n> Question:")
print(query)
print("\n> Answer:")
print(answer)
# # Print the relevant sources used for the answer
print("----------------------------------SOURCE DOCUMENTS---------------------------")
for document in docs:
print("\n> " + document.metadata["source"] + ":")
print(document.page_content)
print("----------------------------------SOURCE DOCUMENTS---------------------------")
if __name__ == "__main__":
main()