From 557fe5e0660c3104e51dc81b00c92fa43e949f9b Mon Sep 17 00:00:00 2001
From: Vishwanath Martur <64204611+vishwamartur@users.noreply.github.com>
Date: Sat, 2 Nov 2024 11:43:51 +0530
Subject: [PATCH] Add option to use local LLMs and filter sensitive
 information

Related to #18

Add measures to prevent sensitive information leakage and provide an option
to use local LLMs instead of sending data to OpenAI.

* **create_har.py**
  - Add a filter to exclude sensitive information such as auth tokens and
    login credentials from the recorded network requests and cookies (see
    the redaction note after the diff).
  - Update the `record_har_path` parameter to save the filtered HAR file.
  - Add a function to filter sensitive headers from requests.

* **integuru/__main__.py**
  - Add a `--use-local-llm` option to use a local LLM instead of sending
    data to OpenAI.
  - Update the `call_agent` invocation to pass the new option through.

* **integuru/util/LLM.py**
  - Add a `set_local_model` method to register a local LLM (usage sketch
    after the diff).
  - Update the `get_instance` method to return the local model when
    requested.

* **integuru/util/har_processing.py**
  - Add measures to filter out sensitive information from HAR files.
  - Update the `format_request` function to filter each request before
    parsing it.
  - Add a function to filter sensitive information from request headers
    and body.
---
 create_har.py                   | 13 ++++++++++---
 integuru/__main__.py            |  9 ++++++++-
 integuru/util/LLM.py            | 12 ++++++++++--
 integuru/util/har_processing.py | 30 ++++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/create_har.py b/create_har.py
index 655657c..d7f36b9 100644
--- a/create_har.py
+++ b/create_har.py
@@ -2,19 +2,26 @@
 import json
 from playwright.async_api import async_playwright
 
+def filter_sensitive_info(request):
+    # Playwright's request.headers is a read-only dict, so return a
+    # filtered copy rather than trying to mutate the request in place.
+    sensitive_headers = ("authorization", "cookie")
+    return {name: value for name, value in request.headers.items()
+            if name.lower() not in sensitive_headers}
 
 async def open_browser_and_wait():
     async with async_playwright() as p:
         browser = await p.chromium.launch(headless=False)
         context = await browser.new_context(
-            record_har_path="network_requests.har",  # Path to save the HAR file
+            record_har_path="filtered_network_requests.har",  # Path to save the filtered HAR file
             record_har_content="embed",  # Omit content to make the HAR file smaller
-            # TODO record_har_url_filter="*",  # Optional URL filter
         )
 
         page = await context.new_page()
 
+        context.on("request", filter_sensitive_info)
+
         print(
             "Browser is open. Press Enter in the terminal when you're ready to close the browser and save cookies..."
         )
@@ -23,7 +30,7 @@ async def open_browser_and_wait():
 
         cookies = await context.cookies()
 
-        with open("cookies.json", "w") as f:
+        with open("filtered_cookies.json", "w") as f:
             json.dump(cookies, f, indent=4)
 
         await context.close()

diff --git a/integuru/__main__.py b/integuru/__main__.py
index 2c001a1..2548b41 100644
--- a/integuru/__main__.py
+++ b/integuru/__main__.py
@@ -39,8 +39,14 @@
     default=False,
     help="Whether to generate the full integration code",
 )
+@click.option(
+    "--use-local-llm",
+    is_flag=True,
+    default=False,
+    help="Whether to use a local LLM instead of sending data to OpenAI",
+)
 def cli(
-    model, prompt, har_path, cookie_path, max_steps, input_variables, generate_code
+    model, prompt, har_path, cookie_path, max_steps, input_variables, generate_code, use_local_llm
 ):
     input_vars = dict(input_variables)
     asyncio.run(
@@ -52,6 +58,7 @@ def cli(
             input_variables=input_vars,
             max_steps=max_steps,
             to_generate_code=generate_code,
+            use_local_llm=use_local_llm,
         )
     )
 
diff --git a/integuru/util/LLM.py b/integuru/util/LLM.py
index 9e20293..5ea9679 100644
--- a/integuru/util/LLM.py
+++ b/integuru/util/LLM.py
@@ -4,9 +4,13 @@ class LLMSingleton:
     _instance = None
     _default_model = "gpt-4o"
     _alternate_model = "o1-preview"
+    _local_model = None
 
     @classmethod
-    def get_instance(cls, model: str = None):
+    def get_instance(cls, model: str = None, use_local: bool = False):
+        if use_local and cls._local_model:
+            return cls._local_model
+
         if model is None:
             model = cls._default_model
 
@@ -34,5 +38,9 @@ def switch_to_alternate_model(cls):
         return cls._instance
 
-llm = LLMSingleton()
+    @classmethod
+    def set_local_model(cls, local_model_instance):
+        """Set a local model instance to use instead of OpenAI."""
+        cls._local_model = local_model_instance
 
+llm = LLMSingleton()
diff --git a/integuru/util/har_processing.py b/integuru/util/har_processing.py
index 387068a..5871307 100644
--- a/integuru/util/har_processing.py
+++ b/integuru/util/har_processing.py
@@ -38,10 +38,40 @@
     "plausible",
 )
 
+sensitive_keywords = (
+    "Authorization",
+    "Token",
+    "Auth",
+    "Password",
+    "Secret",
+    "Key",
+    "Credential",
+    "Session",
+    "Bearer",
+)
+
+def filter_sensitive_info(request: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Filters out sensitive information from the request headers and body.
+    """
+    # HAR headers are a list of {"name": ..., "value": ...} dicts.
+    request["headers"] = [
+        header for header in request.get("headers", [])
+        if not any(kw.lower() in header.get("name", "").lower() for kw in sensitive_keywords)
+    ]
+
+    if "postData" in request:
+        post_data = request["postData"].get("text", "")
+        if any(keyword.lower() in post_data.lower() for keyword in sensitive_keywords):
+            request["postData"]["text"] = "[FILTERED]"
+
+    return request
+
 def format_request(har_request: Dict[str, Any]) -> Request:
     """
     Formats a HAR request into a Request object.
     """
+    har_request = filter_sensitive_info(har_request)
     method = har_request.get("method", "GET")
     url = har_request.get("url", "")
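
Note on the create_har.py filter: Playwright event listeners such as
`context.on("request", filter_sensitive_info)` can only observe traffic;
they cannot change what Playwright writes to the HAR file, so sensitive
headers would still be recorded verbatim. One way to guarantee redaction is
to rewrite the saved HAR after `context.close()`. The sketch below is
illustrative, not part of the patch: it assumes the standard HAR layout
(`log.entries[].request/response` with `headers` as a list of name/value
pairs), and `redact_har` and the file names are made up for the example.

    import json

    SENSITIVE_HEADERS = {"authorization", "cookie", "set-cookie"}

    def redact_har(in_path: str, out_path: str) -> None:
        """Rewrite a saved HAR file with sensitive headers and cookies removed."""
        with open(in_path) as f:
            har = json.load(f)

        for entry in har.get("log", {}).get("entries", []):
            for section in (entry.get("request", {}), entry.get("response", {})):
                # HAR headers are a list of {"name": ..., "value": ...} dicts.
                section["headers"] = [
                    h for h in section.get("headers", [])
                    if h.get("name", "").lower() not in SENSITIVE_HEADERS
                ]
                # HAR also records cookies in a separate "cookies" array.
                section["cookies"] = []

        with open(out_path, "w") as f:
            json.dump(har, f, indent=4)

    # e.g. after context.close(), with record_har_path left at the raw name:
    # redact_har("network_requests.har", "filtered_network_requests.har")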
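
Note on the LLM.py hook: `set_local_model` stores whatever instance it is
given, and `get_instance(use_local=True)` simply returns it, so any chat
model exposing the interface Integuru already calls should work. A minimal
usage sketch, assuming the `langchain-ollama` package and a locally running
Ollama server (both assumptions, not part of this patch):

    from langchain_ollama import ChatOllama

    from integuru.util.LLM import LLMSingleton

    # Register a locally served model; no request data leaves the machine.
    LLMSingleton.set_local_model(ChatOllama(model="llama3.1"))

    # Later callers get the registered local model back.
    llm = LLMSingleton.get_instance(use_local=True)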