源码级别解析 · 源码解析 · 2026
2026-04-23 | 每日技术深度解读
让AI能够像人类一样操作浏览器
from browser_use import Agent, Browser, ChatBrowserUse
import asyncio
async def main():
    """Build a browser and an agent, then run the star-count task once."""
    session = Browser()
    star_counter = Agent(
        task="Find the number of stars of the browser-use repo",
        llm=ChatBrowserUse(),
        browser=session,
    )
    await star_counter.run()


# Only start the event loop when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
只需十余行代码即可启动AI浏览器自动化任务
Agent通过LLM理解任务,通过Browser执行动作
class Agent:
    """Pairs an LLM with a browser to work on one natural-language task."""

    def __init__(
        self,
        task: str,
        llm: Union["ChatBrowserUse", "ChatGoogle", "ChatAnthropic"],
        browser: "Browser",
        tools: Optional["Tools"] = None,
        memory: Optional["MemoryManager"] = None,
        max_steps: int = 50,
        enable_code_execution: bool = False,
    ):
        """Store the collaborators and limits used by the run loop.

        Args:
            task: Natural-language description of what to accomplish.
            llm: Chat model that decides the next actions.
            browser: Browser the actions are executed against.
            tools: Optional extra tools; kept as given.
            memory: Optional memory manager. A fresh ``MemoryManager()`` is
                created when omitted, because the run loop reads and updates
                ``self.memory`` on every step and would fail on ``None``.
            max_steps: Upper bound on loop iterations.
            enable_code_execution: Stored flag; its consumer is not shown here.
        """
        # Annotations are quoted (forward references) because several of the
        # referenced classes are defined further down in this listing.
        self.task = task
        self.llm = llm
        self.browser = browser
        self.tools = tools
        self.memory = memory if memory is not None else MemoryManager()
        self.max_steps = max_steps
        self.enable_code_execution = enable_code_execution
Agent类支持灵活的配置选项
async def run(self) -> "AgentOutput":
    """Run the observe / reason / act / remember loop for ``max_steps`` steps.

    Each step takes exactly one browser snapshot; the extra snapshot that was
    previously taken before the loop was never read and has been removed.

    NOTE(review): as shown, the loop has no completion check and nothing is
    returned despite the ``AgentOutput`` annotation -- presumably the rest of
    the method is elided here; confirm against the full implementation.
    """
    for _ in range(self.max_steps):
        # 1. Observe the current browser state.
        state = await self.browser.get_state()
        # 2. Ask the LLM what to do next.
        response = await self.llm.generate(
            task=self.task,
            state=state,
            memory=self.memory.get_context(),
        )
        # 3. Execute the actions the LLM proposed.
        result = await self._execute_actions(response.actions)
        # 4. Record the step so later prompts can see it.
        self.memory.update(response, result)
循环最多执行 max_steps 步;示例代码省略了任务完成的判断与返回值的构造
class BaseLLM:
    """Common request pipeline shared by chat-model backends.

    ``_init_client``, ``_build_prompt``, ``_call_api`` and ``_parse_response``
    are not defined in this class; they are expected to come from elsewhere
    (``ChatBrowserUse`` supplies ``_build_prompt``).
    """

    def __init__(self, model: str, **kwargs):
        """Keep the model name and whatever ``_init_client()`` returns.

        Extra keyword arguments are accepted but not used by the code shown.
        """
        self.model = model
        self.client = self._init_client()

    async def generate(
        self,
        task: str,
        state: dict,
        memory: str,
        **kwargs
    ) -> LLMResponse:
        """Build a prompt, send it through ``_call_api`` and parse the reply.

        Extra keyword arguments are accepted but not forwarded.
        """
        raw_reply = await self._call_api(self._build_prompt(task, state, memory))
        return self._parse_response(raw_reply)
采用策略模式支持不同LLM提供商
class ChatBrowserUse(BaseLLM):
    """Chat model optimised for browser automation."""

    def _build_prompt(self, task: str, state: dict, memory: str) -> str:
        """Assemble the prompt: task, formatted browser state, memory, action list."""
        # The empty first and last entries reproduce the leading and trailing
        # newline of the original triple-quoted template.
        sections = [
            "",
            f"Task: {task}",
            "Current Browser State:",
            f"{self._format_browser_state(state)}",
            "Memory:",
            f"{memory}",
            "Available Actions:",
            "- click(element_index)",
            "- type(text)",
            "- navigate(url)",
            "- scroll(direction)",
            "",
        ]
        return "\n".join(sections)
提示词包含浏览器状态和可用动作信息
class Browser:
    """Browser handle that can be configured for local or cloud use."""

    def __init__(
        self,
        headless: bool = True,
        use_cloud: bool = False,
        cloud_config: Optional["CloudConfig"] = None,
        proxy: Optional[str] = None,
        user_agent: Optional[str] = None
    ):
        """Record the launch options; no browser is started here.

        Args:
            headless: Stored as ``self.headless``.
            use_cloud: Stored as ``self.use_cloud``.
            cloud_config: Stored as ``self.cloud_config``.
            proxy: Stored as ``self.proxy``.
            user_agent: Stored as ``self.user_agent``.
        """
        # Populated elsewhere (not shown); ``None`` until then.
        self.playwright = None
        self.page = None
        self.context = None
        # Every option is kept. Previously only ``use_cloud`` was stored and
        # the other four arguments were silently discarded.
        self.headless = headless
        self.use_cloud = use_cloud
        self.cloud_config = cloud_config
        self.proxy = proxy
        self.user_agent = user_agent
支持本地和云浏览器两种模式
async def get_state(self) -> dict:
    """Return a snapshot of the current page.

    Returns:
        A dict with keys ``'url'``, ``'title'``, ``'elements'`` and
        ``'screenshot'``. When there is no page, the scalar fields are
        ``None`` and ``'elements'`` is an empty list.
    """
    page = self.page
    if not page:
        # The element scan dereferences self.page, so it must be skipped too;
        # previously only the other three fields were guarded.
        return {'url': None, 'title': None, 'elements': [], 'screenshot': None}
    return {
        'url': page.url,
        'title': await page.title(),
        'elements': await self._get_interactive_elements(),
        'screenshot': await page.screenshot(),
    }
返回结构化的浏览器状态信息
async def _get_interactive_elements(self) -> list:
    """Describe the clickable and text-input elements on the current page.

    Returns:
        A list of dicts with keys ``'index'``, ``'tag'``, ``'text'`` and
        ``'visible'``. ``'index'`` is the element's position in the
        concatenation "clickable elements, then input elements". Elements
        whose inspection fails are left out, so indices may have gaps.
        An empty list is returned when there is no page.
    """
    if not self.page:
        return []
    # Clickable elements first, then text inputs; the index is assigned over
    # the concatenation of both lists.
    clickable = await self.page.query_selector_all('button, a, input[type="submit"]')
    inputable = await self.page.query_selector_all('input[type="text"], textarea')
    elements = []
    for i, element in enumerate(clickable + inputable):
        try:
            text = await element.inner_text()
            # Playwright element handles have no tag_name() method, so the
            # tag is read from the DOM instead. The old call failed for every
            # element and the bare except hid it, leaving the list empty.
            tag = await element.evaluate('el => el.tagName.toLowerCase()')
            elements.append({
                'index': i,
                'tag': tag,
                'text': text,
                'visible': await element.is_visible()
            })
        except Exception:
            # Best effort: skip elements that cannot be inspected (for
            # example, ones detached mid-scan). ``Exception`` rather than a
            # bare except, so task cancellation still propagates.
            continue
    return elements
智能识别可交互元素并分配索引
async def click(self, element_index: int) -> dict:
    """Click the interactive element at ``element_index``.

    The index uses the same ordering as ``_get_interactive_elements``:
    clickable elements first, then text inputs. The handles are queried here
    directly because that helper returns descriptor dicts, which cannot be
    clicked (the previous code called ``.click()`` on such a dict).

    Args:
        element_index: Zero-based position in the combined element list.

    Returns:
        ``{'success': True, 'element': {'index': element_index}}``.

    Raises:
        ValueError: If ``element_index`` is negative or past the end.
        Exception: Whatever the second click attempt raises, if both fail.
    """
    clickable = await self.page.query_selector_all('button, a, input[type="submit"]')
    inputable = await self.page.query_selector_all('input[type="text"], textarea')
    handles = clickable + inputable
    # Reject negatives too: they would otherwise index from the end.
    if not 0 <= element_index < len(handles):
        raise ValueError(f"Element index {element_index} out of range")
    handle = handles[element_index]
    try:
        await handle.click(timeout=5000)
    except Exception:
        # One retry after a short pause; a second failure propagates.
        await asyncio.sleep(1)
        await handle.click(timeout=5000)
    return {'success': True, 'element': {'index': element_index}}
内置重试机制提高成功率
from browser_use import Tools
# Registry that the decorated functions below are added to.
tools = Tools()


@tools.action(description="获取网页标题")
def get_page_title() -> str:
    """Return the current page title (placeholder: a fixed string)."""
    title = "Current Page Title"
    return title


@tools.action(description="搜索信息")
def search_web(query: str, max_results: int = 10) -> list:
    """Search the web (placeholder: both arguments are ignored)."""
    # The real search logic would go here.
    hits = ["result1", "result2"]
    return hits
使用装饰器轻松定义新工具
async def execute_tool(self, tool_name: str, params: dict) -> dict:
    """Run a registered tool and wrap its result.

    Args:
        tool_name: Key into ``self.tools``.
        params: Raw arguments; passed through ``self._validate_params`` first.

    Returns:
        ``{'tool': tool_name, 'result': <tool result>, 'success': True}``.

    Raises:
        ValueError: If ``tool_name`` is not registered.
    """
    # Local import: inspect is not among the imports shown with this snippet.
    import inspect

    if tool_name not in self.tools:
        raise ValueError(f"Tool {tool_name} not found")
    tool = self.tools[tool_name]
    validated_params = self._validate_params(tool, params)
    # Call first, then await if the result is awaitable. This also covers
    # sync callables that return a coroutine (for example, a decorator
    # wrapper around an async function), which iscoroutinefunction() misses.
    result = tool.function(**validated_params)
    if inspect.isawaitable(result):
        result = await result
    return {
        'tool': tool_name,
        'result': result,
        'success': True
    }
支持同步和异步工具执行
class MemoryManager:
    """Keeps a bounded log of agent steps and renders the recent ones as text."""

    def __init__(self, max_memory_size: int = 10000):
        """Create empty stores.

        Args:
            max_memory_size: Maximum number of entries kept in ``short_term``.
        """
        self.short_term = []
        self.long_term = {}  # not used by the code shown here
        self.max_size = max_memory_size

    def update(self, response: "LLMResponse", result: dict):
        """Append one step (timestamp, LLM response, action result)."""
        self.short_term.append({
            'timestamp': datetime.now(),
            'response': response,
            'result': result,
        })
        # Trim once the entry count exceeds the configured limit.
        if len(self.short_term) > self.max_size:
            self._compress_memory()

    def _compress_memory(self):
        """Drop the oldest entries so at most ``max_size`` remain.

        This helper was called but not defined. It is a minimal trimming
        policy; replace it if summarisation into ``long_term`` is intended.
        """
        overflow = len(self.short_term) - self.max_size
        if overflow > 0:
            del self.short_term[:overflow]

    def get_context(self, limit: int = 10) -> str:
        """Render the most recent entries, one per line, oldest first.

        Args:
            limit: How many recent entries to include (default 10, the value
                that was previously hard-coded). Non-positive yields ``""``.

        NOTE(review): reads ``response.task``; confirm ``LLMResponse`` exposes
        that attribute.
        """
        # Guard limit <= 0: a [-0:] slice would return the whole list.
        recent = self.short_term[-limit:] if limit > 0 else []
        return "\n".join(
            f"{entry['timestamp']}: {entry['response'].task} -> {entry['result']}"
            for entry in recent
        )
智能管理记忆避免上下文过长
# Open a web page
browser-use open https://github.com
# Show the clickable elements
browser-use state
# Click an element (by index)
browser-use click 5
# Type text
browser-use type "Hello World"
# Take a screenshot
browser-use screenshot page.png
# Close the browser
browser-use close
简单直观的命令行界面
import click
from browser_use.skill_cli import BrowserUseCLI
@click.group()
def cli():
    """Browser-Use CLI工具"""
    # click shows the docstring as --help text, so it is kept verbatim.


@cli.command()
@click.argument('url')
def open(url):
    """打开指定URL"""
    # NOTE(review): this function shadows the builtin ``open`` at module
    # level; the name is kept because it is this command's public name.
    # The local is not called ``cli`` so it does not shadow the group above.
    browser_cli = BrowserUseCLI()
    browser_cli.open(url)


@cli.command()
def state():
    """显示当前浏览器状态"""
    # Local import: json is used here but not imported with this snippet.
    import json

    browser_cli = BrowserUseCLI()
    # Named ``snapshot`` so it does not shadow this command function.
    snapshot = browser_cli.get_state()
    print(json.dumps(snapshot, indent=2))
模块化设计便于扩展
# Configure a cloud browser
browser = Browser(
use_cloud=True,
cloud_config=CloudConfig(
api_key="your-api-key",  # placeholder value
region="us-west-1",
stealth=True,
proxy_rotation=True
)
)
# Use the cloud API
from browser_use import CloudBrowserUse
llm = CloudBrowserUse(
model="browser-use/bu-30b-a3b-preview",
use_stealth=True,
use_proxy_rotation=True
)
云服务提供额外的隐私和性能特性
class CacheManager:
    """In-memory key/value cache with per-entry TTL and a size cap."""

    def __init__(self, max_size: int = 1000):
        """Create an empty cache holding at most ``max_size`` entries."""
        self.cache = {}
        self.max_size = max_size

    def get(self, key: str) -> Optional[dict]:
        """Return the cached value, or ``None`` if missing or expired.

        An expired entry is removed as a side effect of the lookup.
        """
        entry = self.cache.get(key)
        if entry is None:
            return None
        if self._is_expired(entry):
            del self.cache[key]
            return None
        return entry['value']

    def set(self, key: str, value: dict, ttl: int = 3600):
        """Store ``value`` under ``key`` for ``ttl`` seconds, then prune."""
        # Remove first, so a refreshed key moves to the newest position and
        # is not the first candidate for size-based eviction.
        self.cache.pop(key, None)
        self.cache[key] = {
            'value': value,
            'created_at': time.time(),
            'ttl': ttl
        }
        self._cleanup_expired()

    def _is_expired(self, entry: dict) -> bool:
        """Return True once more than ``ttl`` seconds have passed since creation."""
        return time.time() - entry['created_at'] > entry['ttl']

    def _cleanup_expired(self):
        """Drop expired entries, then evict oldest-inserted ones over ``max_size``.

        Both helpers were referenced but not defined, and ``max_size`` was
        stored without being enforced.
        """
        # Collect keys first: a dict must not change size while iterating it.
        stale = [k for k, entry in self.cache.items() if self._is_expired(entry)]
        for k in stale:
            del self.cache[k]
        # Dicts keep insertion order, so the first key is the oldest one.
        while self.cache and len(self.cache) > self.max_size:
            del self.cache[next(iter(self.cache))]
TTL缓存提高重复操作性能
async def safe_execute(self, func, *args, max_retries: int = 3, **kwargs):
    """Await ``func(*args, **kwargs)``, retrying on recoverable errors.

    Args:
        func: Async callable to run.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Total number of attempts.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on the first successful attempt.

    Raises:
        NetworkError, ElementNotFound: The last such error once every attempt
            has failed.
        MaxRetriesExceededError: If no attempt was made (``max_retries <= 0``).
    """
    last_error = None
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            return await func(*args, **kwargs)
        except NetworkError as e:
            last_error = e
            if not is_last_attempt:
                await asyncio.sleep(2 ** attempt)  # exponential backoff
        except ElementNotFound as e:
            last_error = e
            # Scroll to look for the element, but only if another attempt
            # follows. This branch previously scrolled and slept even after
            # the final attempt, unlike the NetworkError branch.
            if not is_last_attempt:
                await self.scroll('down')
                await asyncio.sleep(1)
    if last_error is not None:
        raise last_error
    raise MaxRetriesExceededError()
智能重试和错误恢复机制
from dataclasses import dataclass
from typing import Optional
@dataclass
class BrowserConfig:
    """Launch options for a browser instance."""

    headless: bool = True
    # NOTE(review): presumably milliseconds -- confirm against the consumer.
    timeout: int = 30000
    # Annotated Optional because the default is None (was a bare ``dict``).
    viewport: Optional[dict] = None
    user_agent: Optional[str] = None
    proxy: Optional[str] = None
@dataclass
class AgentConfig:
    """Tunable limits and switches for an agent run."""

    # Upper bound on loop iterations.
    max_steps: int = 50
    # Off by default.
    enable_code_execution: bool = False
    # Number of attempts for retried operations.
    retry_attempts: int = 3
    # Size budget for the agent's memory.
    memory_size: int = 10000
使用dataclass集中声明带类型注解和默认值的配置(类型注解供静态检查工具使用,运行时不做强制校验)
import pytest
from browser_use import Agent, Browser, ChatBrowserUse
@pytest.mark.asyncio
async def test_agent_navigation():
    """The agent reaches GitHub: the run succeeds and the final title says so."""
    headless_browser = Browser(headless=True)
    navigator = Agent(
        task="Navigate to GitHub and verify title",
        llm=ChatBrowserUse(),
        browser=headless_browser,
    )

    outcome = await navigator.run()

    # Check success first, so a failed run is reported as such rather than
    # as a title mismatch.
    assert outcome.success
    final_title = outcome.final_state['title']
    assert "GitHub" in final_title
使用pytest进行异步测试
async def buy_groceries():
    """Run the grocery-shopping task on a cloud browser and return the order id.

    NOTE(review): ``grocery_tools`` is not defined in this snippet; it must be
    provided by the surrounding module.
    """
    cloud_browser = Browser(use_cloud=True)
    shopper = Agent(
        task="Buy groceries from online supermarket",
        llm=ChatBrowserUse(),
        browser=cloud_browser,
        tools=grocery_tools,
    )
    outcome = await shopper.run()
    return outcome.order_id
完整的购物流程自动化
import prometheus_client
from prometheus_client import Counter, Histogram
# Define the monitoring metrics
# Counter: total browser requests.
REQUEST_COUNT = Counter(
'browser_use_requests_total',
'Total number of browser requests'
)
# Histogram: duration of browser requests, in seconds (per the metric name).
REQUEST_DURATION = Histogram(
'browser_use_request_duration_seconds',
'Duration of browser requests'
)
# Counter: total errors.
REQUEST_ERRORS = Counter(
'browser_use_errors_total',
'Total number of errors'
)
集成Prometheus监控系统
感谢阅读!
访问 https://atcfu.com/ai-articles/browser-use-ai-browser-automation/ 回顾本文