From 557fe5e0660c3104e51dc81b00c92fa43e949f9b Mon Sep 17 00:00:00 2001
From: Vishwanath Martur <64204611+vishwamartur@users.noreply.github.com>
Date: Sat, 2 Nov 2024 11:43:51 +0530
Subject: [PATCH] Add option to use local LLMs and filter sensitive
 information

Related to #18

Add measures to prevent sensitive information leakage and provide an option
to use local LLMs instead of sending data to OpenAI.

* **create_har.py**
  - Add a filter to exclude sensitive information such as auth tokens and
    login credentials from the recorded network requests and cookies (see
    the redaction note after the diff).
  - Update the `record_har_path` parameter to save the filtered HAR file.
  - Add a function to filter sensitive headers from requests.

* **integuru/__main__.py**
  - Add a `--use-local-llm` option to use a local LLM instead of sending
    data to OpenAI.
  - Update the `call_agent` invocation to pass the new option through.

* **integuru/util/LLM.py**
  - Add a `set_local_model` method to register a local LLM (usage sketch
    after the diff).
  - Update the `get_instance` method to return the local model when
    requested.

* **integuru/util/har_processing.py**
  - Add measures to filter out sensitive information from HAR files.
  - Update the `format_request` function to filter each request before
    parsing it.
  - Add a function to filter sensitive information from request headers
    and body.
---
 create_har.py                   | 13 ++++++++++---
 integuru/__main__.py            |  9 ++++++++-
 integuru/util/LLM.py            | 12 ++++++++++--
 integuru/util/har_processing.py | 30 ++++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/create_har.py b/create_har.py
index 655657c..d7f36b9 100644
--- a/create_har.py
+++ b/create_har.py
@@ -2,19 +2,26 @@
 import json
 from playwright.async_api import async_playwright
 
+def filter_sensitive_info(request):
+    # Playwright's request.headers is a read-only dict, so return a
+    # filtered copy rather than trying to mutate the request in place.
+    sensitive_headers = ("authorization", "cookie")
+    return {name: value for name, value in request.headers.items()
+            if name.lower() not in sensitive_headers}
 
 async def open_browser_and_wait():
     async with async_playwright() as p:
         browser = await p.chromium.launch(headless=False)
         context = await browser.new_context(
-            record_har_path="network_requests.har",  # Path to save the HAR file
+            record_har_path="filtered_network_requests.har",  # Path to save the filtered HAR file
             record_har_content="embed",  # Omit content to make the HAR file smaller
-            # TODO record_har_url_filter="*",  # Optional URL filter
         )
 
         page = await context.new_page()
 
+        context.on("request", filter_sensitive_info)
+
         print(
             "Browser is open. Press Enter in the terminal when you're ready to close the browser and save cookies..."
         )
@@ -23,7 +30,7 @@ async def open_browser_and_wait():
 
         cookies = await context.cookies()
 
-        with open("cookies.json", "w") as f:
+        with open("filtered_cookies.json", "w") as f:
             json.dump(cookies, f, indent=4)
 
         await context.close()

diff --git a/integuru/__main__.py b/integuru/__main__.py
index 2c001a1..2548b41 100644
--- a/integuru/__main__.py
+++ b/integuru/__main__.py
@@ -39,8 +39,14 @@
     default=False,
     help="Whether to generate the full integration code",
 )
+@click.option(
+    "--use-local-llm",
+    is_flag=True,
+    default=False,
+    help="Whether to use a local LLM instead of sending data to OpenAI",
+)
 def cli(
-    model, prompt, har_path, cookie_path, max_steps, input_variables, generate_code
+    model, prompt, har_path, cookie_path, max_steps, input_variables, generate_code, use_local_llm
 ):
     input_vars = dict(input_variables)
     asyncio.run(
@@ -52,6 +58,7 @@ def cli(
             input_variables=input_vars,
             max_steps=max_steps,
             to_generate_code=generate_code,
+            use_local_llm=use_local_llm,
         )
     )
 
diff --git a/integuru/util/LLM.py b/integuru/util/LLM.py
index 9e20293..5ea9679 100644
--- a/integuru/util/LLM.py
+++ b/integuru/util/LLM.py
@@ -4,9 +4,13 @@ class LLMSingleton:
     _instance = None
     _default_model = "gpt-4o"
     _alternate_model = "o1-preview"
+    _local_model = None
 
     @classmethod
-    def get_instance(cls, model: str = None):
+    def get_instance(cls, model: str = None, use_local: bool = False):
+        if use_local and cls._local_model:
+            return cls._local_model
+
         if model is None:
             model = cls._default_model
 
@@ -34,5 +38,9 @@ def switch_to_alternate_model(cls):
         return cls._instance
 
-llm = LLMSingleton()
+    @classmethod
+    def set_local_model(cls, local_model_instance):
+        """Set a local model instance to use instead of OpenAI."""
+        cls._local_model = local_model_instance
 
+llm = LLMSingleton()
diff --git a/integuru/util/har_processing.py b/integuru/util/har_processing.py
index 387068a..5871307 100644
--- a/integuru/util/har_processing.py
+++ b/integuru/util/har_processing.py
@@ -38,10 +38,40 @@
     "plausible",
 )
 
+sensitive_keywords = (
+    "Authorization",
+    "Token",
+    "Auth",
+    "Password",
+    "Secret",
+    "Key",
+    "Credential",
+    "Session",
+    "Bearer",
+)
+
+def filter_sensitive_info(request: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Filters out sensitive information from the request headers and body.
+    """
+    # HAR headers are a list of {"name": ..., "value": ...} dicts.
+    request["headers"] = [
+        header for header in request.get("headers", [])
+        if not any(kw.lower() in header.get("name", "").lower() for kw in sensitive_keywords)
+    ]
+
+    if "postData" in request:
+        post_data = request["postData"].get("text", "")
+        if any(keyword.lower() in post_data.lower() for keyword in sensitive_keywords):
+            request["postData"]["text"] = "[FILTERED]"
+
+    return request
+
 def format_request(har_request: Dict[str, Any]) -> Request:
     """
     Formats a HAR request into a Request object.
     """
+    har_request = filter_sensitive_info(har_request)
     method = har_request.get("method", "GET")
     url = har_request.get("url", "")
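
Note on the create_har.py filter: Playwright event listeners such as
`context.on("request", filter_sensitive_info)` can only observe traffic;
they cannot change what Playwright writes to the HAR file, so sensitive
headers would still be recorded verbatim. One way to guarantee redaction is
to rewrite the saved HAR after `context.close()`. The sketch below is
illustrative, not part of the patch: it assumes the standard HAR layout
(`log.entries[].request/response` with `headers` as a list of name/value
pairs), and `redact_har` and the file names are made up for the example.

    import json

    SENSITIVE_HEADERS = {"authorization", "cookie", "set-cookie"}

    def redact_har(in_path: str, out_path: str) -> None:
        """Rewrite a saved HAR file with sensitive headers and cookies removed."""
        with open(in_path) as f:
            har = json.load(f)

        for entry in har.get("log", {}).get("entries", []):
            for section in (entry.get("request", {}), entry.get("response", {})):
                # HAR headers are a list of {"name": ..., "value": ...} dicts.
                section["headers"] = [
                    h for h in section.get("headers", [])
                    if h.get("name", "").lower() not in SENSITIVE_HEADERS
                ]
                # HAR also records cookies in a separate "cookies" array.
                section["cookies"] = []

        with open(out_path, "w") as f:
            json.dump(har, f, indent=4)

    # e.g. after context.close(), with record_har_path left at the raw name:
    # redact_har("network_requests.har", "filtered_network_requests.har")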
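
Note on the LLM.py hook: `set_local_model` stores whatever instance it is
given, and `get_instance(use_local=True)` simply returns it, so any chat
model exposing the interface Integuru already calls should work. A minimal
usage sketch, assuming the `langchain-ollama` package and a locally running
Ollama server (both assumptions, not part of this patch):

    from langchain_ollama import ChatOllama

    from integuru.util.LLM import LLMSingleton

    # Register a locally served model; no request data leaves the machine.
    LLMSingleton.set_local_model(ChatOllama(model="llama3.1"))

    # Later callers get the registered local model back.
    llm = LLMSingleton.get_instance(use_local=True)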