Rate Limiter, Models, Langchain update (#7)
* Update various package versions

* Update splitter modules

* Use updated langchain document loaders.

* Address relative import statement.

* Adding langchain-chroma dependency.

* Refer to updated langchain-chroma module.

* Refer to updated langchain openai package.

* Refer to updated embedding modules.

* Unit test based corrections; now using current Langchain embeddings and vector store.

* Argument not optional.

* Gitignore aider folder.

* Updated langchain references.

* Use community Replicate model.

* Update replicate based model.

* New langchain packages for hugging face.

* Adding Gemini LLM

* Adding mistral LLM.

* Adding LiteLLM

* Setup anthropic through LiteLLM.

* Adding Claude 3

* New chat function using LangChain Expression Language; outstanding issues related to embedding and source references in the output.

* Include RAG source reference in MD, HTML and logging output (handled differently after the LCEL changes).

* Update poetry packages

* Remove model kwargs from embedding settings.

* Remove chat function version no longer used.

* Add in settings update

* Update import from quke (instead of .)

* Adding basic rate limiting functionality.

* Adding LLM rate limiter functionality based on Langchain rate limiter.

* Config update

* Removing debug code.

* Removing old code.

* Removing debug code. Adding rate limiter to Gemini.

* Logging messages.

* Expand rate_limit config settings.

* Summary description update.

---------

Co-authored-by: Erik Oosterop <[email protected]>
EJOOSTEROP and Erik Oosterop authored Aug 2, 2024
1 parent 42e2c29 commit 0424c84
Showing 25 changed files with 4,347 additions and 1,612 deletions.
4 changes: 3 additions & 1 deletion .env.example
@@ -2,4 +2,6 @@ OPENAI_API_KEY = "YOUR-KEY"
HUGGINGFACEHUB_API_TOKEN = "YOUR-KEY"
COHERE_API_KEY = "YOUR-KEY"
REPLICATE_API_TOKEN = "YOUR-KEY"
GOOGLE_API_KEY = "YOUR-KEY"
MISTRAL_API_KEY = "YOUR-KEY"
ANTHROPIC_API_KEY = "YOUR-KEY"
1 change: 1 addition & 0 deletions .gitignore
@@ -174,3 +174,4 @@ DEL_setup.py
readme - SAMPLE.md
quke/temp.yaml
quke/wip.py
.aider*
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -12,7 +12,7 @@
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": true
"source.organizeImports": "explicit"
},
},
"isort.args":["--profile", "black"],
2 changes: 1 addition & 1 deletion README.md
@@ -49,7 +49,7 @@
<h3 align="center">quke</h3>

<p align="center">
Compare the answering capabilities of different LLMs - for example LlaMa, ChatGPT, Cohere, Falcon - against user provided document(s) and questions.
Compare the answering capabilities of different LLMs - for example LlaMa, GPT4o, Mistral, Claude, Cohere, others - against user provided document(s) and questions.
<br />
<a href="https://github.com/ejoosterop/quke"><strong>Explore the docs »</strong></a>
<br />
5,578 changes: 4,039 additions & 1,539 deletions poetry.lock

Large diffs are not rendered by default.

31 changes: 17 additions & 14 deletions pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "quke"
version = "0.5.1"
description = "Compare the answering capabilities of different LLMs - for example LlaMa, ChatGPT, Cohere, Falcon - against user provided document(s) and questions."
version = "0.6.0"
description = "Compare the answering capabilities of different LLMs - for example LlaMa, GPT4o, Mistral, Claude, Cohere, others - against user provided document(s) and questions."
authors = ["Erik Oosterop"]
maintainers = ["Erik Oosterop"]
license = "MIT"
@@ -17,20 +17,23 @@ quke = "quke.quke:quke"

[tool.poetry.dependencies]
python = "^3.11"
hydra-core = "^1.3.2"
python-dotenv = "^1.0.0"
pymupdf = "^1.22.5"
pypdf = "^3.12.1"
tiktoken = "^0.4.0"
lark = "^1.1.7"
huggingface-hub = "^0.16.4"
openai = "^0.27.8"
cohere = "^4.17.0"
replicate = "^0.9.0"
rich = "^13.5.2"
jinja2 = "^3.1.2"
langchain = "^0.0.285"
chromadb = "^0.4.9"
pymupdf = "^1.24.9"
langchain-community = "^0.2.10"
langchain-core = "^0.2.25"
jinja2 = "^3.1.4"
python-dotenv = "^1.0.1"
hydra-core = "^1.3.2"
langchain-chroma = "^0.1.2"
langchain-openai = "^0.1.19"
sentence-transformers = "^3.0.1"
langchain-huggingface = "^0.0.3"
langchain-cohere = "^0.1.9"
replicate = "^0.30.1"
langchain-google-genai = "^1.0.8"
langchain-mistralai = "^0.1.11"
litellm = "^1.42.5"

[tool.poetry.group.dev.dependencies]
pytest-cov = "^4.1.0"
11 changes: 11 additions & 0 deletions quke/conf/config.yaml
@@ -4,6 +4,17 @@ experiment_summary_file: 'chat_session.md'

embed_only: False

# The parameters refer to langchain_core.rate_limiters.InMemoryRateLimiter
rate_limiters:
- gemini:
requests_per_second: 0.03
check_every_n_seconds: 10
- openai:
requests_per_second: 8
check_every_n_seconds: 5
- cohere:
requests_per_second: 0.1

defaults:
- _self_
- llm: cohere
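The `rate_limiters` entries in config.yaml map onto `langchain_core.rate_limiters.InMemoryRateLimiter`, a token-bucket limiter: `requests_per_second` is the token refill rate and `check_every_n_seconds` is how often a blocked caller re-checks for an available token. A minimal pure-Python sketch of that behavior (class and names are illustrative, not quke's or LangChain's actual implementation):

```python
import time


class TokenBucketSketch:
    """Token-bucket rate limiter mirroring the config keys above (illustrative only)."""

    def __init__(self, requests_per_second: float,
                 check_every_n_seconds: float = 0.1,
                 max_bucket_size: float = 1.0) -> None:
        self.rate = requests_per_second           # tokens added per second
        self.check_every = check_every_n_seconds  # polling interval while waiting
        self.max_tokens = max_bucket_size         # burst capacity
        self.tokens = 0.0
        self.last_refill = time.monotonic()

    def _refill(self) -> None:
        now = time.monotonic()
        self.tokens = min(self.max_tokens,
                          self.tokens + (now - self.last_refill) * self.rate)
        self.last_refill = now

    def acquire(self) -> None:
        """Block until a token is available, then consume it."""
        while True:
            self._refill()
            if self.tokens >= 1.0:
                self.tokens -= 1.0
                return
            time.sleep(self.check_every)


# Three requests at 100 req/s should clear almost immediately.
limiter = TokenBucketSketch(requests_per_second=100, check_every_n_seconds=0.01)
start = time.monotonic()
for _ in range(3):
    limiter.acquire()
elapsed = time.monotonic() - start
```

With the gemini settings above (`requests_per_second: 0.03`, `check_every_n_seconds: 10`), each `acquire()` would admit roughly one request every ~33 seconds, polling every 10 seconds while it waits.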
13 changes: 7 additions & 6 deletions quke/conf/embedding/cohere.yaml
@@ -1,5 +1,5 @@
vectordb:
module_name: langchain.vectorstores
module_name: langchain_chroma
class_name: Chroma
vectorstore_location: vector_store/chromadb_cohere

@@ -8,18 +8,19 @@ vectordb:
# -If the folder exists and 'no_overwrite' is specified: document will not be embedded
# -If the folder exists and 'overwrite' is specified, all contents of the vectordb folder will be deleted and a new vectordb will be created.
# -If set to 'append' the new embeddings will be appended to any existing vectordb. If a source document is specified twice it will be embedded twice.
vectorstore_write_mode: no_overwrite
vectorstore_write_mode: overwrite

embedding:
module_name: langchain.embeddings
module_name: langchain_cohere
class_name: CohereEmbeddings
kwargs: #optional
# repo_id: sentence-transformers/all-mpnet-base-v2
kwargs:
# model: embed-english-light-v3.0
# model: embed-english-v3.0
rate_limit_chunks: 300 # ~200 was the max in trials on a free account; actual limits depend on plan and model.
rate_limit_delay: 60 # in seconds

splitter:
module_name: langchain.text_splitter
module_name: langchain_text_splitters
class_name: CharacterTextSplitter
args:
chunk_size: 1000
12 changes: 6 additions & 6 deletions quke/conf/embedding/hf_r.yaml
@@ -1,5 +1,5 @@
vectordb:
module_name: langchain.vectorstores
module_name: langchain_chroma
class_name: Chroma
vectorstore_location: vector_store/chromadb_hf_recursive

@@ -8,18 +8,18 @@ vectordb:
# -If the folder exists and 'no_overwrite' is specified: document will not be embedded
# -If the folder exists and 'overwrite' is specified, all contents of the vectordb folder will be deleted and a new vectordb will be created.
# -If set to 'append' the new embeddings will be appended to any existing vectordb. If a source document is specified twice it will be embedded twice.
vectorstore_write_mode: overwrite
vectorstore_write_mode: no_overwrite

embedding:
module_name: langchain.embeddings
class_name: HuggingFaceHubEmbeddings
module_name: langchain_huggingface.embeddings
class_name: HuggingFaceEmbeddings
kwargs: #optional
repo_id: sentence-transformers/all-mpnet-base-v2
# repo_id: sentence-transformers/all-mpnet-base-v2
rate_limit_chunks: 201 # ~200 was the max in trials on a free account; actual limits depend on plan and model.
rate_limit_delay: 306 # in seconds

splitter:
module_name: langchain.text_splitter
module_name: langchain_text_splitters
class_name: RecursiveCharacterTextSplitter
args:
chunk_size: 800
10 changes: 5 additions & 5 deletions quke/conf/embedding/huggingface.yaml
@@ -1,5 +1,5 @@
vectordb:
module_name: langchain.vectorstores
module_name: langchain_chroma
class_name: Chroma
vectorstore_location: vector_store/chromadb_hf_del

@@ -11,15 +11,15 @@ vectordb:
vectorstore_write_mode: no_overwrite

embedding:
module_name: langchain.embeddings
class_name: HuggingFaceHubEmbeddings
module_name: langchain_huggingface.embeddings
class_name: HuggingFaceEmbeddings
kwargs: #optional
repo_id: sentence-transformers/all-mpnet-base-v2
# repo_id: sentence-transformers/all-mpnet-base-v2
rate_limit_chunks: 201 # ~200 was the max in trials on a free account; actual limits depend on plan and model.
rate_limit_delay: 306 # in seconds

splitter:
module_name: langchain.text_splitter
module_name: langchain_text_splitters
class_name: CharacterTextSplitter
args:
chunk_size: 1000
10 changes: 5 additions & 5 deletions quke/conf/embedding/openai.yaml
@@ -1,5 +1,5 @@
vectordb:
module_name: langchain.vectorstores
module_name: langchain_chroma
class_name: Chroma
vectorstore_location: vector_store/chromadb_openai

@@ -8,18 +8,18 @@ vectordb:
# -If the folder exists and 'no_overwrite' is specified: document will not be embedded
# -If the folder exists and 'overwrite' is specified, all contents of the vectordb folder will be deleted and a new vectordb will be created.
# -If set to 'append' the new embeddings will be appended to any existing vectordb. If a source document is specified twice it will be embedded twice.
vectorstore_write_mode: overwrite
vectorstore_write_mode: no_overwrite

embedding:
module_name: langchain.embeddings
module_name: langchain_openai
class_name: OpenAIEmbeddings
kwargs: #optional
# repo_id: sentence-transformers/all-mpnet-base-v2
# model: text-embedding-3-large
rate_limit_chunks: 200 # ~200 was the max in trials on a free account; actual limits depend on plan and model.
rate_limit_delay: 60 # in seconds

splitter:
module_name: langchain.text_splitter
module_name: langchain_text_splitters
class_name: CharacterTextSplitter
args:
chunk_size: 1000
10 changes: 7 additions & 3 deletions quke/conf/llm/cohere.yaml
@@ -1,6 +1,10 @@
module_name_llm: langchain.llms
class_name_llm: Cohere
name: command-nightly
module_name_llm: langchain_cohere
class_name_llm: ChatCohere
#name: command-nightly
name: command-r-plus

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: cohere

llm_args:
model: ${llm.name}
11 changes: 7 additions & 4 deletions quke/conf/llm/falcon7b.yaml
@@ -1,11 +1,14 @@
module_name_llm: langchain.llms
class_name_llm: HuggingFaceHub
module_name_llm: langchain_huggingface
class_name_llm: HuggingFaceEndpoint
name: tiiuae/falcon-7b-instruct

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: none

llm_args:
repo_id: ${llm.name}
temperature: 0.1
max_new_tokens: 512
model_kwargs:
max_length: 2000
temperature: 0.5
max_new_tokens: 200
num_return_sequences: 3
10 changes: 10 additions & 0 deletions quke/conf/llm/gemini.yaml
@@ -0,0 +1,10 @@
module_name_llm: langchain_google_genai
class_name_llm: ChatGoogleGenerativeAI
name: gemini-1.5-pro

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: gemini

llm_args:
model: ${llm.name}
temperature: 0
6 changes: 4 additions & 2 deletions quke/conf/llm/gpt4o.yaml
@@ -1,8 +1,10 @@
# Importing OpenAI may be needed to set the API key, though LangChain may just take it from the environment.
module_name_llm: langchain.chat_models
module_name_llm: langchain_openai
class_name_llm: ChatOpenAI
name: gpt-4o-mini

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: openai

llm_args:
model_name: ${llm.name}
temperature: 0
11 changes: 11 additions & 0 deletions quke/conf/llm/litellm.yaml
@@ -0,0 +1,11 @@
module_name_llm: langchain_community.chat_models
class_name_llm: ChatLiteLLM
#name: anthropic/claude-3-5-sonnet-20240620
name: anthropic/claude-3

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: none

llm_args:
model_name: ${llm.name}
temperature: 0
7 changes: 5 additions & 2 deletions quke/conf/llm/llama2.yaml
@@ -1,12 +1,15 @@
module_name_llm: langchain.llms
module_name_llm: langchain_community.llms
class_name_llm: Replicate
name: a16z-infra/llama-2-7b-chat:5ec5fdadd80ace49f5a2b2178cceeb9f2f77c493b85b1131002c26e6b2b13184
#name: a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: none

llm_args:
model: ${llm.name}
input:
temperature: 0.75
temperature: 0.1
max_length: 500
max_new_tokens: 500
min_new_tokens: -1
11 changes: 11 additions & 0 deletions quke/conf/llm/mistral.yaml
@@ -0,0 +1,11 @@
module_name_llm: langchain_mistralai.chat_models
class_name_llm: ChatMistralAI
name: mistral-large-latest
# mistral-large-2407

# rate_limiter is optional. If it exists it needs to refer to a limiter defined in config.yaml.
rate_limiter: none

llm_args:
model: ${llm.name}
temperature: 0
8 changes: 3 additions & 5 deletions quke/embed.py
@@ -10,9 +10,9 @@
from typing import Iterator

# [ ] TODO: PyMU is faster, PyPDF more accurate: https://github.com/py-pdf/benchmarks
from langchain.document_loaders import CSVLoader, PyMuPDFLoader, TextLoader
from langchain_community.document_loaders import CSVLoader, PyMuPDFLoader, TextLoader

from . import ClassImportDefinition, ClassRateLimit, DatabaseAction
from quke import ClassImportDefinition, ClassRateLimit, DatabaseAction


@dataclass
@@ -239,12 +239,10 @@ def embed_these_chunks(
class_ = getattr(module, vectordb_import.class_name)
vectordb_type = class_()

vectordb = vectordb_type.from_documents(
_ = vectordb_type.from_documents(
documents=chunks, embedding=embedding, persist_directory=vectordb_location
)

vectordb.persist()

logging.info(f"{len(chunks)} chunks persisted into database at {vectordb_location}")

return len(chunks)
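The `module_name`/`class_name` pairs in the embedding and llm YAML configs are resolved the way embed.py does above: import the module, then `getattr` the class. (The explicit `vectordb.persist()` call is dropped in this commit, apparently because newer Chroma persists automatically when a `persist_directory` is supplied.) A self-contained sketch of the dynamic-loading pattern, demonstrated with a stdlib class since the real names come from the YAML config:

```python
import importlib


def load_class(module_name: str, class_name: str):
    """Resolve a class from config strings, as embed.py does for vectordb/embedding/splitter."""
    module = importlib.import_module(module_name)
    return getattr(module, class_name)


# Stand-in for e.g. module_name: langchain_chroma, class_name: Chroma
counter_cls = load_class("collections", "Counter")
counts = counter_cls("chroma")
```

This indirection is what let the commit migrate providers by editing YAML (`langchain.vectorstores` → `langchain_chroma`, `langchain.embeddings` → `langchain_cohere`, and so on) with minimal code changes.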
