Create o1_web_extractor.py

poggiolabs · Dec 18, 2024 · 94267ff · 94267ff
1 parent e899ecb
commit 94267ff
Showing 1 changed file with 147 additions and 0 deletions.
diff --git a/examples/o1_web_extractor/o1_web_extractor.py b/examples/o1_web_extractor/o1_web_extractor.py
@@ -0,0 +1,147 @@
+import os
+import json
+import requests
+from dotenv import load_dotenv
+from openai import OpenAI
+from serpapi import GoogleSearch
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+
+# Load environment variables
+load_dotenv()
+
+# Initialize clients
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+
+def search_google(query):
+    """Search Google using SerpAPI and return top results."""
+    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
+    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
+    return search.get_dict().get("organic_results", [])
+
+def select_urls_with_o1(company, objective, serp_results):
+    """
+    Use O1 to select the most relevant URLs from SERP results for the given company and objective.
+    Returns a JSON object with a "selected_urls" property that is an array of strings.
+    """
+    try:
+        # Prepare the data for O1
+        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")} 
+                     for r in serp_results if r.get("link")]
+
+        response = client.chat.completions.create(
+            model="o1-2024-12-17",
+            messages=[
+                {
+                    "role": "developer",
+                    "content": "You select URLs from the SERP results relevant to the company and objective."
+                },
+                {
+                    "role": "user",
+                    "content": (
+                        f"Company: {company}\n"
+                        f"Objective: {objective}\n"
+                        f"SERP Results: {json.dumps(serp_data)}\n\n"
+                        "Return a JSON object with a property 'selected_urls' that contains an array "
+                        "of URLs most likely to help meet the objective. Add a /* to the end of the URL if you think it should search all of the pages in the site. Do not return any social media links. For example: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}"
+                    )
+                }
+            ],
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "selected_urls_object",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "selected_urls": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                }
+                            }
+                        },
+                        "required": ["selected_urls"],
+                        "additionalProperties": False
+                    }
+                }
+            }
+        )
+
+        # The response is guaranteed to follow the specified JSON schema
+        result = json.loads(response.choices[0].message.content)
+        urls = result.get("selected_urls", [])
+        return urls
+
+    except Exception as e:
+        print(f"{Colors.RED}Error selecting URLs with O1: {e}{Colors.RESET}")
+        return []
+
+
+
+def extract_company_info(urls, prompt, company, api_key):
+    """Use requests to call Firecrawl's extract endpoint with selected URLs."""
+    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")
+
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {api_key}'
+    }
+
+    payload = {
+        "urls": urls,
+        "prompt": prompt + " for " + company
+    }
+
+    try:
+        response = requests.post(
+            "https://api.firecrawl.dev/v1/extract",
+            headers=headers,
+            json=payload
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data
+    except Exception as e:
+        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
+        return None
+
+def main():
+    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
+    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")
+
+    serp_results = search_google(f"{company}")
+    if not serp_results:
+        print(f"{Colors.RED}No search results found.{Colors.RESET}")
+        return
+
+    # Ask O1 to select URLs
+    selected_urls = select_urls_with_o1(company, objective, serp_results)
+
+    if not selected_urls:
+        print(f"{Colors.RED}O1 did not return any URLs.{Colors.RESET}")
+        return
+
+    print(f"{Colors.CYAN}Selected URLs for extraction by O1:{Colors.RESET}")
+    for url in selected_urls:
+        print(f"- {url}")
+
+    data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)
+
+    if data and data.get('success') and data.get('data'):
+        print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
+        print(json.dumps(data['data'], indent=2))
+    else:
+        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")
+
+if __name__ == "__main__":
+    main()