CapDavyJones
diff --git a/‎.gitignore
+5-1 b/‎.gitignore
+5-1
diff --git a/‎.vscode/launch.json
+42-11 b/‎.vscode/launch.json
+42-11
diff --git a/‎README.md
+80-125 b/‎README.md
+80-125
diff --git a/‎browser_use/__init__.py
+7-3 b/‎browser_use/__init__.py
+7-3
@@ -160,4 +160,8 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-temp
+temp
+tmp
+
+
+.DS_Store
@@ -2,20 +2,51 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "Python: Debug Tests",
+            "name": "Python Debugger: Module",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "examples.extend_actions"
+        },
+        {
+            "name": "Python: Debug extend_actions",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "examples.extend_actions",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}"
+            }
+        },
+        {
+            "name": "Python: Debug Captcha Tests",
+            "type": "python",
+            "request": "launch",
+            "module": "pytest",
+            "args": [
+                "tests/test_agent_actions.py",
+                "-v",
+                "-k",
+                "test_captcha_solver",
+                "--capture=no",
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false
+        },
+        {
+            "name": "Python: Debug Ecommerce Interaction",
             "type": "python",
             "request": "launch",
-            "program": "${workspaceFolder}/.venv/bin/pytest",
+            "module": "pytest",
             "args": [
-                "src/tests/test_kayak_search.py",
+                "tests/test_agent_actions.py",
                 "-v",
-                "-s"
+                "-k",
+                "test_ecommerce_interaction",
+                "--capture=no",
             ],
             "console": "integratedTerminal",
-			"justMyCode": false,
-			"env": {
-				"PYTHONPATH": "${workspaceFolder}"
-			}
-		}
-	]
-}
+            "justMyCode": false
+        }
+    ]
+}
@@ -1,194 +1,149 @@
-<div align="center">
-
-# 🌐 Browser-Use
+# 🌐 Browser Use
 
-### Open-Source Web Automation with LLMs
+Make websites accessible for AI agents 🤖.
 
 [![GitHub stars](https://img.shields.io/github/stars/gregpr07/browser-use?style=social)](https://github.com/gregpr07/browser-use/stargazers)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
-[![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://discord.gg/uaCtrbbv)
+[![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord)
 
-</div>
+Browser Use is the easiest way to connect your AI agents with the browser. If you have used Browser Use for your project, feel free to show it off in our [Discord](https://link.browser-use.com/discord).
 
-Let LLMs interact with websites through a simple interface.
+# Quick start
 
-## Short Example
+With pip:
 
 ```bash
 pip install browser-use
 ```
 
+Spin up your agent:
+
 ```python
 from langchain_openai import ChatOpenAI
 from browser_use import Agent
 
 agent = Agent(
-    task="Go to hackernews on show hn and give me top 10 post titles, their points and hours. Calculate for each the ratio of points per hour.",
+    task="Find a one-way flight from Bali to Oman on 12 January 2025 on Google Flights. Return me the cheapest option.",
     llm=ChatOpenAI(model="gpt-4o"),
 )
 
 # ... inside an async function
 await agent.run()
 ```
 
-## Demo
-
-<div>
-    <a href="https://www.loom.com/share/63612b5994164cb1bb36938d62fe9983">
-      <img style="max-width:300px;" src="https://cdn.loom.com/sessions/thumbnails/63612b5994164cb1bb36938d62fe9983-7133f9e169672e6f-full-play.gif">
-    </a>
-    <p><i>Prompt: Go to hackernews on show hn and give me top 10 post titles, their points and hours. Calculate for each the ratio of points per hour. (1x speed) </i></p>
-</div>
-<div>
-    <a href="https://www.loom.com/share/2af938b9f8024647950a9e18b3946054">
-      <img style="max-width:300px;" src="https://cdn.loom.com/sessions/thumbnails/2af938b9f8024647950a9e18b3946054-b99c733cf670e568-full-play.gif">
-    </a>
-    <p><i>Prompt: Search the top 3 AI companies 2024 and find what out what concrete hardware each is using for their model. (1x speed)</i></p>
-</div>
-  
-
-
-<div style="display: flex; justify-content: space-between; margin-top: 20px;">
-    <div style="flex: 1; margin-right: 10px;">
-        <img style="width: 100%;" src="./static/kayak.gif" alt="Kayak flight search demo">
-        <p><i>Prompt: Go to kayak.com and find a one-way flight from Zürich to San Francisco on 12 January 2025. (2.5x speed)</i></p>
-    </div>
-    <div style="flex: 1; margin-left: 10px;">
-        <img style="width: 100%;" src="./static/photos.gif" alt="Photos search demo">
-        <p><i>Prompt: Opening new tabs and searching for images for these people: Albert Einstein, Oprah Winfrey, Steve Jobs. (2.5x speed)</i></p>
-    </div>
-</div>
-</div>
-
-## Local Setup
-
-1. Create a virtual environment and install dependencies:
+And don't forget to add your API keys to your `.env` file.
 
 ```bash
-# To install all dependencies including dev
-pip install . ."[dev]"
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
 ```
 
-2. Add your API keys to the `.env` file:
+# Demo
 
-```bash
-cp .env.example .env
-```
+DEMO VIDEO HERE
 
-E.g. for OpenAI:
+# Features ⭐
 
-```bash
-OPENAI_API_KEY=
-```
+- Vision + HTML extraction
+- Automatic multi-tab management
+- Extract clicked elements' XPaths
+- Add custom actions (e.g. add data to database which the LLM can use)
+- Self-correcting
+- Use any LLM supported by LangChain (e.g. GPT-4o, GPT-4o mini, Claude 3.5 Sonnet, Llama 3.1 405B)
 
-You can use any LLM model supported by LangChain by adding the appropriate environment variables. See [langchain models](https://python.langchain.com/docs/integrations/chat/) for available options.
+## Register custom actions
 
-## Features
+If you want to add custom actions your agent can take, you can register them like this:
 
-- Universal LLM Support - Works with any Language Model
-- Interactive Element Detection - Automatically finds interactive elements
-- Multi-Tab Management - Seamless handling of browser tabs
-- XPath Extraction for scraping functions - No more manual DevTools inspection
-- Vision Model Support - Process visual page information
-- Customizable Actions - Add your own browser interactions (e.g. add data to database which the LLM can use)
-- Handles dynamic content - dont worry about cookies or changing content
-- Chain-of-thought prompting with memory - Solve long-term tasks
-- Self-correcting - If the LLM makes a mistake, the agent will self-correct its actions
+```python
+from browser_use.agent.service import Agent
+from browser_use.browser.service import Browser
+from browser_use.controller.service import Controller
 
-## Advanced Examples
+# Initialize controller first
+controller = Controller()
 
-### Chain of Agents
+@controller.action('Ask user for information')
+def ask_human(question: str, display_question: bool) -> str:
+	return input(f'\n{question}\nInput: ')
+```
 
-You can persist the browser across multiple agents and chain them together.
+Or define your parameters using Pydantic:
 
 ```python
-from asyncio import run
-from browser_use import Agent, Controller
-from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-load_dotenv()
-
-# Persist browser state across agents
-controller = Controller()
+class JobDetails(BaseModel):
+	title: str
+	company: str
+	job_link: str
+	salary: Optional[str] = None
+
+@controller.action('Save job details which you found on page', param_model=JobDetails, requires_browser=True)
+def save_job(params: JobDetails, browser: Browser):
+	print(params)
+
+	# use the browser normally
+	browser.driver.get(params.job_link)
+```
 
-# Initialize browser agent
-agent1 = Agent(
-    task="Open 3 VCs websites in the New York area.",
-    llm=ChatAnthropic(model="claude-3-5-sonnet-20240620", timeout=25, stop=None),
-    controller=controller)
-agent2 = Agent(
-    task="Give me the names of the founders of the companies in all tabs.",
-    llm=ChatAnthropic(model="claude-3-5-sonnet-20240620", timeout=25, stop=None),
-    controller=controller)
+and then run your agent:
 
-run(agent1.run())
-founders, history = run(agent2.run())
+```python
+model = ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.3)
+agent = Agent(task=task, llm=model, controller=controller)
 
-print(founders)
+await agent.run()
 ```
 
-You can use the `history` to run the agents again deterministically.
+## Get XPath history
 
-## Command Line Usage
+To get the entire history of everything the agent has done, you can use the output of the `run` method:
 
-Run examples directly from the command line (clone the repo first):
+```python
+history: list[AgentHistory] = await agent.run()
 
-```bash
-python examples/try.py "Your query here" --provider [openai|anthropic]
+print(history)
 ```
 
-### Anthropic
+## More examples
 
-You need to add `ANTHROPIC_API_KEY` to your environment variables. Example usage:
+For more examples, see the [examples](examples) folder or join the [Discord](https://link.browser-use.com/discord) and show off your project.
 
-```bash
+# Contributing
 
-python examples/try.py "Search the top 3 AI companies 2024 and find out in 3 new tabs what hardware each is using for their models" --provider anthropic
-```
+Contributions are welcome! Feel free to open issues for bugs or feature requests.
 
-### OpenAI
+## Setup
 
-You need to add `OPENAI_API_KEY` to your environment variables. Example usage:
+1. Create a virtual environment and install dependencies:
 
 ```bash
-python examples/try.py "Go to hackernews on show hn and give me top 10 post titles, their points and hours. Calculate for each the ratio of points per hour. " --provider anthropic
+# To install all dependencies including dev
+pip install -r requirements.txt -r requirements-dev.txt
 ```
 
-## 🤖 Supported Models
-
-All LangChain chat models are supported. Tested with:
-
-- GPT-4o
-- GPT-4o Mini
-- Claude 3.5 Sonnet
-- LLama 3.1 405B
+2. Add your API keys to the `.env` file:
 
-## Limitations
+```bash
+cp .env.example .env
+```
 
-- When extracting page content, the message length increases and the LLM gets slower.
-- Currently one agent costs about 0.01$
-- Sometimes it tries to repeat the same task over and over again.
-- Some elements might not be extracted which you want to interact with.
-- What should we focus on the most?
-  - Robustness
-  - Speed
-  - Cost reduction
+or copy the following to your `.env` file:
 
-## Roadmap
+```bash
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+```
 
-- [x] Save agent actions and execute them deterministically
-- [ ] Pydantic forced output
-- [ ] Third party SERP API for faster Google Search results
-- [ ] Multi-step action execution to increase speed
-- [ ] Test on mind2web dataset
-- [ ] Add more browser actions
+You can use any LLM model supported by LangChain by adding the appropriate environment variables. See [langchain models](https://python.langchain.com/docs/integrations/chat/) for available options.
 
-## Contributing
+### Building the package
 
-Contributions are welcome! Feel free to open issues for bugs or feature requests.
+```bash
+hatch build
+```
 
-Feel free to join the [Discord](https://discord.gg/uaCtrbbv) for discussions and support.
+Feel free to join the [Discord](https://link.browser-use.com/discord) for discussions and support.
 
 ---
 
 
@@ -1,6 +1,10 @@
-from browser_use.agent.service import AgentService as Agent
-from browser_use.browser.service import BrowserService as Browser
-from browser_use.controller.service import ControllerService as Controller
+from browser_use.logging_config import setup_logging
+
+setup_logging()
+
+from browser_use.agent.service import Agent as Agent
+from browser_use.browser.service import Browser as Browser
+from browser_use.controller.service import Controller as Controller
 from browser_use.dom.service import DomService
 
 __all__ = ['Agent', 'Browser', 'Controller', 'DomService']