diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..b3a9d10 --- /dev/null +++ b/.env.example @@ -0,0 +1,10 @@ +# Device +DEVICE_SERIAL= # leave empty for auto-detect + +# VLM Provider: poe / openrouter / local +VLM_PROVIDER=poe +VLM_MODEL=Qwen/Qwen2.5-VL-7B-Instruct + +# API Keys (fill the one matching your provider) +POE_API_KEY= +OPENROUTER_API_KEY= diff --git a/.gitignore b/.gitignore index dd31b51..21a5631 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,7 @@ __pycache__/ .vscode/ .idea/ *.log +data/screenshots/ +*.egg-info/ +.venv/ +venv/ diff --git a/.memory/project-status.md b/.memory/project-status.md new file mode 100644 index 0000000..19a24f7 --- /dev/null +++ b/.memory/project-status.md @@ -0,0 +1,77 @@ +--- +name: GUI Agent 项目状态 +description: 手机GUI Agent项目当前进度、技术决策和待确认事项 +type: project +--- + +## 项目状态:端到端已跑通 + 手机端 OCR 已部署 + +### 设备信息 +- **华为 P40 Pro**(ELS-AN00) +- 序列号:UQG5T20416000119 +- 分辨率:1200x2640 +- 系统:HarmonyOS 4.x(兼容安卓层,ADB 可用) +- ADB 路径:`/opt/homebrew/bin/adb` +- 连接注意:华为手机需在开发者选项中额外打开"仅充电模式下允许ADB调试" +- **已开启「通过USB安装应用」权限**(2026-03-29) + +### 已完成 +- 七层管线骨架代码(L1-L7)全部就位 +- Web 控制台(FastAPI + 暗色主题 UI)已验证可运行 +- 端口 4380,VLM 默认走 Poe API +- 支持 8 种动作类型(tap/swipe/type/long_press/back/home/scroll/wait) +- Agent 主循环含历史记忆(最近 5 步)和连续错误自动停止 +- **ADB 截屏已验证通过**(2026-03-29) +- **Mac 端 OCR 元素定位已验证**(2026-03-29)— easyocr 中文识别,返回像素坐标 +- **中文文本输入已验证**(2026-03-29)— uiautomator2 send_keys +- **端到端发微信消息已跑通 3 次**(2026-03-29)— "你是大聪明"、"祝你生日快乐"、"生日快乐" +- **手机端 OCR Service APK 已部署**(2026-03-29)— ML Kit Chinese bundled,端口 18900 + +### 手机端 OCR Service(android-ocr-service/) +- **引擎**:Google ML Kit text-recognition-chinese(bundled 版,不依赖 GMS,华为可用) +- **架构**:Kotlin APK = OcrEngine + NanoHTTPD(18900) + ForegroundService +- **接口**: + - `GET /health` — 健康检查 + - `GET /ocr?path=/data/local/tmp/s.png` — 读文件 OCR + - `GET /ocr?path=...&text=微信` — 按文本过滤 + - `POST /snap` — POST 图片字节直接 OCR(NanoHTTPD 二进制处理有 bug,待修) +- **使用流程**: + ```bash + adb shell am start -n 
com.guiagent.ocr/.MainActivity + adb forward tcp:18900 tcp:18900 + adb shell "screencap -p /data/local/tmp/s.png" + curl http://localhost:18900/ocr?path=/data/local/tmp/s.png + ``` +- **性能**:首次 ~2.4s(模型加载),后续 ~1.8s/次 +- **构建**:`ANDROID_HOME=/opt/homebrew/share/android-commandlinetools JAVA_HOME=/opt/homebrew/Cellar/openjdk@21/21.0.10/libexec/openjdk.jdk/Contents/Home ./gradlew assembleDebug` + +### 关键技术决策 +| 能力 | 方案 | 备注 | +|------|------|------| +| 元素定位(Mac) | easyocr | pytesseract 中文分词差,uiautomator dump 在华为微信上返回空 | +| 元素定位(手机端) | ML Kit Chinese (bundled) | 不依赖 GMS/HMS,APK 自带模型 | +| 中文输入 | uiautomator2 send_keys | 需装辅助 APK,华为需开 USB 安装权限 | +| 截屏 | `adb shell screencap -p /data/local/tmp/s.png` | 不经 FUSE,比 /sdcard/ 快 | +| adb input text | 不支持中文 | NullPointerException,clipboard 也不可用 | +| 截屏显示 | 必须 sips -Z 1800 缩小 | 原始 1200x2640 超 Claude 2000px 限制 | + +### 已知问题 +1. OCR 偶尔误读("康"→"東")— ML Kit 和 easyocr 都有此问题 +2. POST /snap 端点 NanoHTTPD 二进制 body 解析 bug — 文件方式 workaround +3. 微信双开弹选择框 — 每次 am start 会弹"使用以下方式打开" +4. 发送按钮白字绿底 OCR 不稳定 — 用坐标 (1008, 2425) 或 OCR "(田发送" + +### 下一步(周一继续) +1. **速度优化**:发送按钮固定坐标不走 OCR(省2s),缩短 sleep(省2s),目标 5-6s/操作 +2. **OCR 推理优化**:缩图再识别 / NNAPI 加速,目标 <1s +3. **集成到 Agent 主循环**:device OCR 引擎接入 ocr_grounding.py +4. 配置 .env(Poe API Key) +5. 接入 VLM(Poe API 调 Qwen2.5-VL)— 复杂场景屏幕理解 +6. 端到端跑通复杂多步任务(滑动、长按、跨 App) +7. 
完善验证纠错层 + +### 技术背景 +项目灵感来自对字节 UI-TARS / 豆包手机的深度调研。结论: +- UI-TARS 开源的是权重+推理壳,训练代码和系统级操控完全闭源 +- 核心壁垒不是模型,是"截屏→理解→定位→规划→执行→验证"的全链路 +- 本项目目标:用开源 VLM + ADB 复现这个全链路 diff --git a/RULES.md b/RULES.md index 5fca6dc..3acd2a4 100644 --- a/RULES.md +++ b/RULES.md @@ -1,17 +1,43 @@ # 手机 GUI Agent 自动操控 -## 启动 -- `待补充` — 端口 4380 +## 架构 -## 部署 -- 平台:待定 -- 域名:待定 +七层管线闭环:截屏 → 理解 → 定位 → 规划 → 执行 → 验证 → 循环 + +``` +src/ +├── capture/ # L1 - ADB/scrcpy 截屏 +├── vision/ # L2 - VLM 屏幕理解 +├── grounding/ # L3 - 元素定位(自然语言→坐标) +├── planner/ # L4 - 任务规划与分解 +├── executor/ # L5 - ADB 动作执行 +└── verifier/ # L6+L7 - 验证纠错 + 状态记忆 +``` + +## 启动 + +- `python -m src.main` — 主服务,端口 4380 +- `python scripts/test_device.py` — 测试 ADB 连接 + +## 技术栈 + +- Python 3.11+ +- ADB + scrcpy(截屏与操控) +- Qwen2.5-VL / UI-TARS-1.5(视觉理解) +- FastAPI(Web 控制台) +- Poe API / OpenRouter(LLM 调用,按用户偏好) ## 环境变量 -- 待补充 + +- `DEVICE_SERIAL` — Android 设备序列号(adb devices 查看) +- `VLM_PROVIDER` — vlm 提供者:`local` / `poe` / `openrouter` +- `VLM_MODEL` — 模型名,默认 `Qwen/Qwen2.5-VL-7B-Instruct` +- `POE_API_KEY` — Poe API Key(VLM_PROVIDER=poe 时必填) +- `OPENROUTER_API_KEY` — OpenRouter Key(备用) ## 规则 -- 待补充 -## 注意事项 -- 待补充 +- 截屏用 adb exec-out screencap,不用 scrcpy 录屏流(省资源) +- 动作执行后必须等待 + 重新截屏验证 +- 所有截屏保存到 `data/screenshots/` 供调试 +- 坐标系统统一为百分比 (0-1),执行时再转设备像素 diff --git a/android-ocr-service/.gradle/8.5/checksums/checksums.lock b/android-ocr-service/.gradle/8.5/checksums/checksums.lock new file mode 100644 index 0000000..16a1cc8 Binary files /dev/null and b/android-ocr-service/.gradle/8.5/checksums/checksums.lock differ diff --git a/android-ocr-service/.gradle/8.5/checksums/md5-checksums.bin b/android-ocr-service/.gradle/8.5/checksums/md5-checksums.bin new file mode 100644 index 0000000..cdaea17 Binary files /dev/null and b/android-ocr-service/.gradle/8.5/checksums/md5-checksums.bin differ diff --git a/android-ocr-service/.gradle/8.5/checksums/sha1-checksums.bin b/android-ocr-service/.gradle/8.5/checksums/sha1-checksums.bin new file mode 100644 index 
0000000..dc587a1 Binary files /dev/null and b/android-ocr-service/.gradle/8.5/checksums/sha1-checksums.bin differ diff --git a/android-ocr-service/.gradle/8.5/dependencies-accessors/dependencies-accessors.lock b/android-ocr-service/.gradle/8.5/dependencies-accessors/dependencies-accessors.lock new file mode 100644 index 0000000..af9bf9c Binary files /dev/null and b/android-ocr-service/.gradle/8.5/dependencies-accessors/dependencies-accessors.lock differ diff --git a/android-ocr-service/.gradle/8.5/dependencies-accessors/gc.properties b/android-ocr-service/.gradle/8.5/dependencies-accessors/gc.properties new file mode 100644 index 0000000..e69de29 diff --git a/android-ocr-service/.gradle/8.5/fileChanges/last-build.bin b/android-ocr-service/.gradle/8.5/fileChanges/last-build.bin new file mode 100644 index 0000000..f76dd23 Binary files /dev/null and b/android-ocr-service/.gradle/8.5/fileChanges/last-build.bin differ diff --git a/android-ocr-service/.gradle/8.5/fileHashes/fileHashes.lock b/android-ocr-service/.gradle/8.5/fileHashes/fileHashes.lock new file mode 100644 index 0000000..4010d39 Binary files /dev/null and b/android-ocr-service/.gradle/8.5/fileHashes/fileHashes.lock differ diff --git a/android-ocr-service/.gradle/8.5/gc.properties b/android-ocr-service/.gradle/8.5/gc.properties new file mode 100644 index 0000000..e69de29 diff --git a/android-ocr-service/.gradle/8.7/checksums/checksums.lock b/android-ocr-service/.gradle/8.7/checksums/checksums.lock new file mode 100644 index 0000000..9c43c0c Binary files /dev/null and b/android-ocr-service/.gradle/8.7/checksums/checksums.lock differ diff --git a/android-ocr-service/.gradle/8.7/checksums/md5-checksums.bin b/android-ocr-service/.gradle/8.7/checksums/md5-checksums.bin new file mode 100644 index 0000000..5cb020e Binary files /dev/null and b/android-ocr-service/.gradle/8.7/checksums/md5-checksums.bin differ diff --git a/android-ocr-service/.gradle/8.7/checksums/sha1-checksums.bin 
b/android-ocr-service/.gradle/8.7/checksums/sha1-checksums.bin new file mode 100644 index 0000000..e5d9d85 Binary files /dev/null and b/android-ocr-service/.gradle/8.7/checksums/sha1-checksums.bin differ diff --git a/android-ocr-service/.gradle/8.7/dependencies-accessors/gc.properties b/android-ocr-service/.gradle/8.7/dependencies-accessors/gc.properties new file mode 100644 index 0000000..e69de29 diff --git a/android-ocr-service/.gradle/8.7/executionHistory/executionHistory.bin b/android-ocr-service/.gradle/8.7/executionHistory/executionHistory.bin new file mode 100644 index 0000000..f603f34 Binary files /dev/null and b/android-ocr-service/.gradle/8.7/executionHistory/executionHistory.bin differ diff --git a/android-ocr-service/.gradle/8.7/executionHistory/executionHistory.lock b/android-ocr-service/.gradle/8.7/executionHistory/executionHistory.lock new file mode 100644 index 0000000..310b0b0 Binary files /dev/null and b/android-ocr-service/.gradle/8.7/executionHistory/executionHistory.lock differ diff --git a/android-ocr-service/.gradle/8.7/fileChanges/last-build.bin b/android-ocr-service/.gradle/8.7/fileChanges/last-build.bin new file mode 100644 index 0000000..f76dd23 Binary files /dev/null and b/android-ocr-service/.gradle/8.7/fileChanges/last-build.bin differ diff --git a/android-ocr-service/.gradle/8.7/fileHashes/fileHashes.bin b/android-ocr-service/.gradle/8.7/fileHashes/fileHashes.bin new file mode 100644 index 0000000..b02e58e Binary files /dev/null and b/android-ocr-service/.gradle/8.7/fileHashes/fileHashes.bin differ diff --git a/android-ocr-service/.gradle/8.7/fileHashes/fileHashes.lock b/android-ocr-service/.gradle/8.7/fileHashes/fileHashes.lock new file mode 100644 index 0000000..98bb767 Binary files /dev/null and b/android-ocr-service/.gradle/8.7/fileHashes/fileHashes.lock differ diff --git a/android-ocr-service/.gradle/8.7/fileHashes/resourceHashesCache.bin b/android-ocr-service/.gradle/8.7/fileHashes/resourceHashesCache.bin new file mode 100644 
index 0000000..d4f4b8f Binary files /dev/null and b/android-ocr-service/.gradle/8.7/fileHashes/resourceHashesCache.bin differ diff --git a/android-ocr-service/.gradle/8.7/gc.properties b/android-ocr-service/.gradle/8.7/gc.properties new file mode 100644 index 0000000..e69de29 diff --git a/android-ocr-service/.gradle/9.4.1/checksums/checksums.lock b/android-ocr-service/.gradle/9.4.1/checksums/checksums.lock new file mode 100644 index 0000000..3f9fa26 Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/checksums/checksums.lock differ diff --git a/android-ocr-service/.gradle/9.4.1/checksums/md5-checksums.bin b/android-ocr-service/.gradle/9.4.1/checksums/md5-checksums.bin new file mode 100644 index 0000000..84b5680 Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/checksums/md5-checksums.bin differ diff --git a/android-ocr-service/.gradle/9.4.1/checksums/sha1-checksums.bin b/android-ocr-service/.gradle/9.4.1/checksums/sha1-checksums.bin new file mode 100644 index 0000000..043e69f Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/checksums/sha1-checksums.bin differ diff --git a/android-ocr-service/.gradle/9.4.1/executionHistory/executionHistory.bin b/android-ocr-service/.gradle/9.4.1/executionHistory/executionHistory.bin new file mode 100644 index 0000000..9076fc7 Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/executionHistory/executionHistory.bin differ diff --git a/android-ocr-service/.gradle/9.4.1/executionHistory/executionHistory.lock b/android-ocr-service/.gradle/9.4.1/executionHistory/executionHistory.lock new file mode 100644 index 0000000..5f536ce Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/executionHistory/executionHistory.lock differ diff --git a/android-ocr-service/.gradle/9.4.1/fileChanges/last-build.bin b/android-ocr-service/.gradle/9.4.1/fileChanges/last-build.bin new file mode 100644 index 0000000..f76dd23 Binary files /dev/null and 
b/android-ocr-service/.gradle/9.4.1/fileChanges/last-build.bin differ diff --git a/android-ocr-service/.gradle/9.4.1/fileHashes/fileHashes.bin b/android-ocr-service/.gradle/9.4.1/fileHashes/fileHashes.bin new file mode 100644 index 0000000..c315122 Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/fileHashes/fileHashes.bin differ diff --git a/android-ocr-service/.gradle/9.4.1/fileHashes/fileHashes.lock b/android-ocr-service/.gradle/9.4.1/fileHashes/fileHashes.lock new file mode 100644 index 0000000..cdc6c3c Binary files /dev/null and b/android-ocr-service/.gradle/9.4.1/fileHashes/fileHashes.lock differ diff --git a/android-ocr-service/.gradle/9.4.1/gc.properties b/android-ocr-service/.gradle/9.4.1/gc.properties new file mode 100644 index 0000000..e69de29 diff --git a/android-ocr-service/.gradle/buildOutputCleanup/buildOutputCleanup.lock b/android-ocr-service/.gradle/buildOutputCleanup/buildOutputCleanup.lock new file mode 100644 index 0000000..7216bfd Binary files /dev/null and b/android-ocr-service/.gradle/buildOutputCleanup/buildOutputCleanup.lock differ diff --git a/android-ocr-service/.gradle/buildOutputCleanup/cache.properties b/android-ocr-service/.gradle/buildOutputCleanup/cache.properties new file mode 100644 index 0000000..80aa2dd --- /dev/null +++ b/android-ocr-service/.gradle/buildOutputCleanup/cache.properties @@ -0,0 +1,2 @@ +#Sun Mar 29 02:14:23 CST 2026 +gradle.version=8.7 diff --git a/android-ocr-service/.gradle/buildOutputCleanup/outputFiles.bin b/android-ocr-service/.gradle/buildOutputCleanup/outputFiles.bin new file mode 100644 index 0000000..cdd335b Binary files /dev/null and b/android-ocr-service/.gradle/buildOutputCleanup/outputFiles.bin differ diff --git a/android-ocr-service/.gradle/file-system.probe b/android-ocr-service/.gradle/file-system.probe new file mode 100644 index 0000000..f780a7d Binary files /dev/null and b/android-ocr-service/.gradle/file-system.probe differ diff --git 
a/android-ocr-service/.gradle/vcs-1/gc.properties b/android-ocr-service/.gradle/vcs-1/gc.properties new file mode 100644 index 0000000..e69de29 diff --git a/android-ocr-service/app/build.gradle.kts b/android-ocr-service/app/build.gradle.kts new file mode 100644 index 0000000..d2dd9c4 --- /dev/null +++ b/android-ocr-service/app/build.gradle.kts @@ -0,0 +1,43 @@ +plugins { + id("com.android.application") + id("org.jetbrains.kotlin.android") +} + +android { + namespace = "com.guiagent.ocr" + compileSdk = 31 + + defaultConfig { + applicationId = "com.guiagent.ocr" + minSdk = 26 + targetSdk = 31 + versionCode = 1 + versionName = "1.0" + } + + buildTypes { + release { + isMinifyEnabled = false + } + } + + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + + kotlinOptions { + jvmTarget = "1.8" + } +} + +dependencies { + // ML Kit Text Recognition - bundled model (no GMS needed!) + implementation("com.google.mlkit:text-recognition-chinese:16.0.0") + + // HTTP server + implementation("org.nanohttpd:nanohttpd:2.3.1") + + // JSON + implementation("com.google.code.gson:gson:2.10.1") +} diff --git a/android-ocr-service/app/src/main/AndroidManifest.xml b/android-ocr-service/app/src/main/AndroidManifest.xml new file mode 100644 index 0000000..5a231f9 --- /dev/null +++ b/android-ocr-service/app/src/main/AndroidManifest.xml @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/android-ocr-service/app/src/main/java/com/guiagent/ocr/MainActivity.kt b/android-ocr-service/app/src/main/java/com/guiagent/ocr/MainActivity.kt new file mode 100644 index 0000000..f0a0c32 --- /dev/null +++ b/android-ocr-service/app/src/main/java/com/guiagent/ocr/MainActivity.kt @@ -0,0 +1,23 @@ +package com.guiagent.ocr + +import android.app.Activity +import android.content.Intent +import android.os.Bundle +import android.widget.TextView + +class MainActivity : Activity() { + override fun onCreate(savedInstanceState: Bundle?) 
{ + super.onCreate(savedInstanceState) + val tv = TextView(this).apply { + text = "OCR Service\nPort: 18900\nStarting..." + textSize = 20f + setPadding(40, 40, 40, 40) + } + setContentView(tv) + + // Start the service + val intent = Intent(this, OcrService::class.java) + startForegroundService(intent) + tv.text = "OCR Service\nPort: 18900\nRunning!" + } +} diff --git a/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrEngine.kt b/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrEngine.kt new file mode 100644 index 0000000..09ec703 --- /dev/null +++ b/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrEngine.kt @@ -0,0 +1,79 @@ +package com.guiagent.ocr + +import android.graphics.Bitmap +import android.graphics.BitmapFactory +import com.google.mlkit.vision.common.InputImage +import com.google.mlkit.vision.text.TextRecognition +import com.google.mlkit.vision.text.chinese.ChineseTextRecognizerOptions +import java.io.File +import java.util.concurrent.CountDownLatch +import java.util.concurrent.TimeUnit + +data class TextBox( + val text: String, + val x: Int, + val y: Int, + val w: Int, + val h: Int, + val confidence: Float +) { + val cx get() = x + w / 2 + val cy get() = y + h / 2 +} + +object OcrEngine { + + private val recognizer by lazy { + TextRecognition.getClient(ChineseTextRecognizerOptions.Builder().build()) + } + + fun recognize(imagePath: String): List { + val file = File(imagePath) + if (!file.exists()) return emptyList() + val bitmap = BitmapFactory.decodeFile(imagePath) ?: return emptyList() + return recognizeBitmap(bitmap) + } + + /** 直接截屏并识别,不落盘 */ + fun screencapAndRecognize(): List { + val process = Runtime.getRuntime().exec("screencap -p") + val bytes = process.inputStream.readBytes() + process.waitFor() + if (bytes.isEmpty()) return emptyList() + val bitmap = BitmapFactory.decodeByteArray(bytes, 0, bytes.size) ?: return emptyList() + return recognizeBitmap(bitmap) + } + + fun recognizeBitmap(bitmap: Bitmap): List { + val image 
= InputImage.fromBitmap(bitmap, 0) + val results = mutableListOf() + val latch = CountDownLatch(1) + + recognizer.process(image) + .addOnSuccessListener { visionText -> + for (block in visionText.textBlocks) { + for (line in block.lines) { + val box = line.boundingBox ?: continue + results.add( + TextBox( + text = line.text, + x = box.left, + y = box.top, + w = box.width(), + h = box.height(), + confidence = line.confidence ?: 0.8f + ) + ) + } + } + latch.countDown() + } + .addOnFailureListener { + latch.countDown() + } + + latch.await(10, TimeUnit.SECONDS) + bitmap.recycle() + return results + } +} diff --git a/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrHttpServer.kt b/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrHttpServer.kt new file mode 100644 index 0000000..2e16213 --- /dev/null +++ b/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrHttpServer.kt @@ -0,0 +1,88 @@ +package com.guiagent.ocr + +import android.graphics.BitmapFactory +import com.google.gson.Gson +import fi.iki.elonen.NanoHTTPD +import java.io.ByteArrayOutputStream + +class OcrHttpServer(port: Int = 18900) : NanoHTTPD(port) { + + private val gson = Gson() + private val defaultPath = "/sdcard/ocr_screen.png" + + override fun serve(session: IHTTPSession): Response { + return when (session.uri) { + "/ocr" -> handleOcr(session) + "/snap" -> handleSnap(session) + "/health" -> jsonResponse(mapOf("status" to "ok", "engine" to "mlkit-chinese")) + else -> newFixedLengthResponse(Response.Status.NOT_FOUND, MIME_PLAINTEXT, "404") + } + } + + /** 读文件方式 OCR */ + private fun handleOcr(session: IHTTPSession): Response { + val params = session.parms ?: emptyMap() + val imagePath = params["path"] ?: defaultPath + return doOcr(params["text"]) { OcrEngine.recognize(imagePath) } + } + + /** POST 图片数据直接 OCR,不存文件 */ + private fun handleSnap(session: IHTTPSession): Response { + val params = session.parms ?: emptyMap() + + if (session.method == Method.POST) { + // NanoHTTPD parseBody 将 
binary data 存到临时文件 + val bodyFiles = HashMap() + session.parseBody(bodyFiles) + + // postData 键对应临时文件路径 + val tmpPath = bodyFiles["postData"] + if (tmpPath != null) { + val imageBytes = java.io.File(tmpPath).readBytes() + val bitmap = BitmapFactory.decodeByteArray(imageBytes, 0, imageBytes.size) + if (bitmap != null) { + return doOcr(params["text"]) { OcrEngine.recognizeBitmap(bitmap) } + } + return jsonResponse(mapOf("error" to "decode failed", "size" to imageBytes.size, "count" to 0)) + } + return jsonResponse(mapOf("error" to "no body received", "count" to 0)) + } + + // GET: 读文件方式 fallback + return handleOcr(session) + } + + private fun doOcr(query: String?, recognize: () -> List): Response { + val startTime = System.currentTimeMillis() + var results = recognize() + + if (!query.isNullOrBlank()) { + results = results.filter { it.text.contains(query) } + } + + val elapsed = System.currentTimeMillis() - startTime + + val response = mapOf( + "results" to results.map { box -> + mapOf( + "text" to box.text, + "x" to box.x, + "y" to box.y, + "w" to box.w, + "h" to box.h, + "cx" to box.cx, + "cy" to box.cy, + "confidence" to box.confidence + ) + }, + "count" to results.size, + "elapsed_ms" to elapsed + ) + return jsonResponse(response) + } + + private fun jsonResponse(data: Any): Response { + val json = gson.toJson(data) + return newFixedLengthResponse(Response.Status.OK, "application/json", json) + } +} diff --git a/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrService.kt b/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrService.kt new file mode 100644 index 0000000..01e84c1 --- /dev/null +++ b/android-ocr-service/app/src/main/java/com/guiagent/ocr/OcrService.kt @@ -0,0 +1,49 @@ +package com.guiagent.ocr + +import android.app.* +import android.content.Intent +import android.os.Build +import android.os.IBinder +import android.util.Log + +class OcrService : Service() { + + private var server: OcrHttpServer? 
= null + private val TAG = "OcrService" + private val PORT = 18900 + + override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { + startForegroundNotification() + + if (server == null) { + server = OcrHttpServer(PORT).also { + it.start() + Log.i(TAG, "OCR HTTP server started on port $PORT") + } + } + return START_STICKY + } + + override fun onDestroy() { + server?.stop() + server = null + Log.i(TAG, "OCR HTTP server stopped") + super.onDestroy() + } + + override fun onBind(intent: Intent?): IBinder? = null + + private fun startForegroundNotification() { + val channelId = "ocr_service" + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + val channel = NotificationChannel(channelId, "OCR Service", NotificationManager.IMPORTANCE_LOW) + getSystemService(NotificationManager::class.java).createNotificationChannel(channel) + } + val notification = Notification.Builder(this, channelId) + .setContentTitle("OCR Service") + .setContentText("Running on port $PORT") + .setSmallIcon(android.R.drawable.ic_menu_camera) + .build() + startForeground(1, notification) + } +} diff --git a/android-ocr-service/app/src/main/res/values/strings.xml b/android-ocr-service/app/src/main/res/values/strings.xml new file mode 100644 index 0000000..041eec0 --- /dev/null +++ b/android-ocr-service/app/src/main/res/values/strings.xml @@ -0,0 +1,4 @@ + + + OCR Service + diff --git a/android-ocr-service/build.gradle.kts b/android-ocr-service/build.gradle.kts new file mode 100644 index 0000000..2eb1b24 --- /dev/null +++ b/android-ocr-service/build.gradle.kts @@ -0,0 +1,4 @@ +plugins { + id("com.android.application") version "8.5.1" apply false + id("org.jetbrains.kotlin.android") version "2.0.0" apply false +} diff --git a/android-ocr-service/gradle.properties b/android-ocr-service/gradle.properties new file mode 100644 index 0000000..729b51a --- /dev/null +++ b/android-ocr-service/gradle.properties @@ -0,0 +1,3 @@ +org.gradle.jvmargs=-Xmx2048m +android.useAndroidX=true 
+kotlin.code.style=official diff --git a/android-ocr-service/gradle/wrapper/gradle-wrapper.jar b/android-ocr-service/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..d997cfc Binary files /dev/null and b/android-ocr-service/gradle/wrapper/gradle-wrapper.jar differ diff --git a/android-ocr-service/gradle/wrapper/gradle-wrapper.properties b/android-ocr-service/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..b82aa23 --- /dev/null +++ b/android-ocr-service/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,7 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip +networkTimeout=10000 +validateDistributionUrl=true +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/android-ocr-service/gradlew b/android-ocr-service/gradlew new file mode 100755 index 0000000..97de990 --- /dev/null +++ b/android-ocr-service/gradlew @@ -0,0 +1,249 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. 
+while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +# This is normally unused +# shellcheck disable=SC2034 +APP_BASE_NAME=${0##*/} +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. 
+ # shellcheck disable=SC2039,SC3045 + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. + +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS='-Dfile.encoding=UTF-8 "-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/android-ocr-service/gradlew.bat b/android-ocr-service/gradlew.bat new file mode 100644 index 0000000..ea603b4 --- /dev/null +++ b/android-ocr-service/gradlew.bat @@ -0,0 +1,92 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. 
+@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%"=="" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%"=="" set DIRNAME=. +@rem This is normally unused +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS=-Dfile.encoding=UTF-8 "-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if %ERRORLEVEL% equ 0 goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. 
class Settings(BaseSettings):
    """Central runtime configuration for the phone GUI agent.

    Values are populated by pydantic-settings from the process environment
    and from the ``.env`` file named in ``model_config``; the defaults below
    apply when neither provides a value.
    """

    # Device
    device_serial: Optional[str] = None  # None = auto-detect first device
    # NOTE(review): default is the Homebrew location on macOS; other hosts
    # presumably need to override this via the environment -- confirm.
    adb_path: str = "/opt/homebrew/bin/adb"
    screenshot_dir: str = "data/screenshots"  # where captured screenshots are saved

    # VLM
    vlm_provider: str = "poe"  # local / poe / openrouter
    vlm_model: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    poe_api_key: Optional[str] = None
    openrouter_api_key: Optional[str] = None

    # Agent
    max_steps: int = 20  # upper bound on steps per task in the agent loop
    action_delay: float = 1.5  # seconds to wait after each action
    screenshot_timeout: float = 5.0  # seconds allowed for the adb screencap call
    verify_after_action: bool = True

    # Server
    # NOTE(review): 0.0.0.0 binds every interface; the web console can drive
    # the connected phone, so confirm this exposure is intended.
    host: str = "0.0.0.0"
    port: int = 4380

    # Read overrides from a UTF-8 encoded .env file in the working directory.
    model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
Accept USB debugging prompt on phone") + sys.exit(1) + + print(f"[OK] Device: {info['model']}") + print(f" Serial: {info['serial']}") + print(f" Resolution: {info['resolution']}") + print(f" All devices: {info['all_devices']}") + + print("\nTaking screenshot...") + img = cap.screenshot(save=True) + print(f"[OK] Screenshot: {img.size[0]}x{img.size[1]}") + print(f" Saved to: {cap.screenshot_dir}/") + + +if __name__ == "__main__": + main() diff --git a/scripts/test_ocr_grounding.py b/scripts/test_ocr_grounding.py new file mode 100644 index 0000000..ec331ff --- /dev/null +++ b/scripts/test_ocr_grounding.py @@ -0,0 +1,149 @@ +"""Test OCR grounding: take a screenshot and find text elements. + +Usage: + # Find a specific text on current screen + python scripts/test_ocr_grounding.py "微信" + + # Detect ALL text on screen (debug mode) + python scripts/test_ocr_grounding.py --all + + # Use a saved screenshot instead of live ADB capture + python scripts/test_ocr_grounding.py "发送" --image data/screenshots/test.png + + # Try different engines + python scripts/test_ocr_grounding.py "微信" --engine easyocr + python scripts/test_ocr_grounding.py "微信" --engine pytesseract + + # Also try uiautomator dump (hybrid mode) + python scripts/test_ocr_grounding.py "微信" --hybrid + + # Save annotated screenshot with bounding boxes drawn + python scripts/test_ocr_grounding.py --all --annotate +""" + +import sys +import os +import argparse + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from PIL import Image, ImageDraw, ImageFont +from src.grounding.ocr_grounding import OCRGrounding + + +def annotate_image(img: Image.Image, boxes, query: str = "") -> Image.Image: + """Draw bounding boxes on the image for visualization.""" + annotated = img.copy() + draw = ImageDraw.Draw(annotated) + + for box in boxes: + is_match = box.contains_text(query) if query else False + color = "red" if is_match else "lime" + width = 3 if is_match else 1 + + draw.rectangle( + [box.x, 
def _save_annotated(img, boxes, query, filename):
    """Draw *boxes* on a copy of *img* and save it as data/screenshots/<filename>."""
    out_path = f"data/screenshots/{filename}"
    # The directory is only created as a side effect of ADBCapture(); when the
    # script runs with --image it may not exist yet, so create it here.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    annotated = annotate_image(img, boxes, query=query)
    annotated.save(out_path)
    print(f"\nAnnotated image saved: {out_path}")


def main():
    """Command-line entry point for exercising OCR grounding.

    Loads a screenshot (from ``--image`` or live from the device), then
    optionally lists every detected text region (``--all``) and/or searches
    for ``query``. With ``--annotate`` an image with bounding boxes is
    written under ``data/screenshots/``.

    Exits with status 1 when the device is not connected or the query is not
    found on screen.
    """
    parser = argparse.ArgumentParser(description="Test OCR grounding on phone screen")
    parser.add_argument("query", nargs="?", default=None, help="Text to find on screen")
    parser.add_argument("--all", action="store_true", help="Detect all text on screen")
    parser.add_argument("--image", type=str, help="Use saved screenshot instead of ADB")
    parser.add_argument("--engine", type=str, default="auto",
                        choices=["auto", "pytesseract", "easyocr"],
                        help="OCR engine to use")
    parser.add_argument("--hybrid", action="store_true",
                        help="Try uiautomator + OCR hybrid approach")
    parser.add_argument("--annotate", action="store_true",
                        help="Save annotated screenshot with bounding boxes")
    args = parser.parse_args()

    if not args.query and not args.all:
        parser.error("Provide a search query or --all")

    # Get screenshot
    if args.image:
        print(f"Loading image: {args.image}")
        img = Image.open(args.image)
    else:
        from src.capture import ADBCapture
        cap = ADBCapture()
        info = cap.check_device()
        if not info["connected"]:
            print(f"[FAIL] {info['error']}")
            sys.exit(1)
        print(f"Device: {info['model']} ({info['resolution']})")
        print("Taking screenshot...")
        img = cap.screenshot(save=True)

    print(f"Image size: {img.width}x{img.height}")
    grounding = OCRGrounding(engine=args.engine)

    # Full-screen OCR result, kept so the search branch can reuse it for
    # annotation instead of running the (slow) engine again.
    all_boxes = None

    if args.all:
        print(f"\n--- Detecting ALL text (engine={args.engine}) ---\n")
        all_boxes = grounding.detect_all(img)
        if not all_boxes:
            print("[WARN] No text detected!")
        else:
            print(f"Found {len(all_boxes)} text regions:\n")
            for i, box in enumerate(all_boxes, 1):
                nx, ny = box.center_normalized(img.width, img.height)
                print(f" {i:3d}. '{box.text}'")
                print(f" pixel=({box.cx}, {box.cy}) "
                      f"norm=({nx:.3f}, {ny:.3f}) "
                      f"conf={box.confidence:.0%}")

        if args.annotate and all_boxes:
            _save_annotated(img, all_boxes, args.query or "", "annotated_all.png")

    if args.query:
        print(f"\n--- Searching for: '{args.query}' (engine={args.engine}) ---\n")

        if args.hybrid:
            result = grounding.find_text_hybrid(img, args.query)
        else:
            result = grounding.find_text(img, args.query)

        if result is None:
            print(f"[NOT FOUND] '{args.query}' was not found on screen.")
            print("\nTip: Run with --all to see all detected text.")
            sys.exit(1)
        else:
            nx, ny = result.center_normalized(img.width, img.height)
            print(f"[FOUND] '{result.text}'")
            print(f" Pixel center: ({result.cx}, {result.cy})")
            print(f" Normalized center: ({nx:.4f}, {ny:.4f})")
            print(f" Bounding box: x={result.x} y={result.y} "
                  f"w={result.w} h={result.h}")
            print(f" Confidence: {result.confidence:.0%}")
            print()
            print(f" To tap this element:")
            print(f" adb shell input tap {result.cx} {result.cy}")

        # Show all matches
        all_matches = grounding.find_all_matches(img, args.query)
        if len(all_matches) > 1:
            print(f"\n ({len(all_matches)} total matches found)")
            for i, m in enumerate(all_matches):
                print(f" {i+1}. '{m.text}' at ({m.cx},{m.cy}) conf={m.confidence:.0%}")

        if args.annotate:
            boxes = all_boxes if all_boxes is not None else grounding.detect_all(img)
            _save_annotated(img, boxes, args.query, "annotated_search.png")
class ADBCapture:
    """ADB-based screen capture for Android devices.

    Uses the adb binary and (optional) device serial from ``settings``.
    When no serial is configured, ``check_device`` adopts the first connected
    device and subsequent commands target it.
    """

    def __init__(self):
        self.adb = settings.adb_path
        self.serial = settings.device_serial
        self.screenshot_dir = Path(settings.screenshot_dir)
        self.screenshot_dir.mkdir(parents=True, exist_ok=True)
        self._resolution: tuple[int, int] | None = None

    def _adb_cmd(self, *args: str) -> list[str]:
        """Build the adb argv, inserting ``-s <serial>`` when a serial is known."""
        cmd = [self.adb]
        if self.serial:
            cmd.extend(["-s", self.serial])
        cmd.extend(args)
        return cmd

    @staticmethod
    def _parse_devices(output: str) -> list[str]:
        """Return serials of devices in state ``device`` from ``adb devices`` output.

        Only lines of the form ``<serial>\\t device`` qualify, so the header,
        daemon start-up chatter, and ``unauthorized``/``offline`` entries are
        ignored without relying on the header being the first line.
        """
        serials = []
        for line in output.splitlines():
            parts = line.strip().split("\t")
            if len(parts) == 2 and parts[1] == "device":
                serials.append(parts[0])
        return serials

    @staticmethod
    def _parse_wm_size(output: str) -> tuple[int, int]:
        """Extract ``(width, height)`` from ``wm size`` output.

        The last ``WxH`` pair is used, so an ``Override size`` line (printed
        after ``Physical size``) takes precedence, as before.

        Raises:
            ValueError: if the output contains no ``WxH`` pair.
        """
        import re

        sizes = re.findall(r"(\d+)\s*x\s*(\d+)", output)
        if not sizes:
            raise ValueError(f"Could not parse screen size from: {output.strip()!r}")
        w, h = sizes[-1]
        return int(w), int(h)

    def check_device(self) -> dict:
        """Check if device is connected and return device info.

        Returns:
            ``{"connected": False, "error": ...}`` when no usable device is
            present (including when the configured serial is not among the
            connected devices); otherwise a dict with ``connected``,
            ``serial``, ``model``, ``resolution`` and ``all_devices``.
        """
        result = subprocess.run(
            self._adb_cmd("devices"),
            capture_output=True, text=True, timeout=5
        )
        devices = self._parse_devices(result.stdout)

        if not devices:
            return {"connected": False, "error": "No device found"}

        if self.serial and self.serial not in devices:
            # Without this check every later command would target a device
            # that is not there and fail with an unrelated parse error.
            return {
                "connected": False,
                "error": f"Device {self.serial} not found "
                         f"(connected: {', '.join(devices)})",
            }

        if not self.serial:
            self.serial = devices[0]
        serial = self.serial

        # Get device model
        model_result = subprocess.run(
            self._adb_cmd("shell", "getprop", "ro.product.model"),
            capture_output=True, text=True, timeout=5
        )
        model = model_result.stdout.strip()

        # Get screen resolution
        w, h = self.get_resolution()

        return {
            "connected": True,
            "serial": serial,
            "model": model,
            "resolution": f"{w}x{h}",
            "all_devices": devices,
        }

    def get_resolution(self) -> tuple[int, int]:
        """Get device screen resolution (cached after the first successful call).

        Raises:
            ValueError: if ``wm size`` produced no parsable size.
        """
        if self._resolution:
            return self._resolution

        result = subprocess.run(
            self._adb_cmd("shell", "wm", "size"),
            capture_output=True, text=True, timeout=5
        )
        # Output: "Physical size: 1080x2400"
        self._resolution = self._parse_wm_size(result.stdout)
        return self._resolution

    def screenshot(self, save: bool = True) -> "Image.Image":
        """Take a screenshot and return as PIL Image.

        Args:
            save: Whether to save the screenshot to disk for debugging.

        Returns:
            PIL Image of the current screen.

        Raises:
            RuntimeError: if adb reports failure or returns no image data.
        """
        result = subprocess.run(
            self._adb_cmd("exec-out", "screencap", "-p"),
            capture_output=True, timeout=settings.screenshot_timeout
        )
        if result.returncode != 0 or not result.stdout:
            detail = result.stderr.decode(errors="replace").strip() or "no image data returned"
            raise RuntimeError(f"Screenshot failed: {detail}")

        img = Image.open(io.BytesIO(result.stdout))

        if save:
            ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            path = self.screenshot_dir / f"{ts}.png"
            img.save(path)

        return img

    def screenshot_base64(self) -> str:
        """Take screenshot and return as base64-encoded PNG string."""
        import base64
        img = self.screenshot(save=True)
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
+""" + +import subprocess +import time +from dataclasses import dataclass + +from config import settings + + +@dataclass +class Action: + """A single GUI action to execute.""" + type: str # tap, swipe, type, long_press, back, home, scroll, wait + x: float = 0.0 # normalized x (0-1) + y: float = 0.0 # normalized y (0-1) + text: str = "" # for type action + x2: float = 0.0 # for swipe end + y2: float = 0.0 # for swipe end + duration: int = 300 # ms, for long_press and swipe + + +class ADBExecutor: + """Execute actions on Android device via ADB.""" + + def __init__(self, capture): + self.capture = capture + self.adb = settings.adb_path + self.serial = settings.device_serial + + def _adb_cmd(self, *args: str) -> list[str]: + cmd = [self.adb] + if self.serial: + cmd.extend(["-s", self.serial]) + cmd.extend(args) + return cmd + + def _run(self, *args: str): + cmd = self._adb_cmd(*args) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) + if result.returncode != 0: + raise RuntimeError(f"ADB command failed: {' '.join(cmd)}\n{result.stderr}") + return result.stdout + + def _to_pixels(self, x: float, y: float) -> tuple[int, int]: + """Convert normalized (0-1) coordinates to device pixels.""" + w, h = self.capture.get_resolution() + return int(x * w), int(y * h) + + def execute(self, action: Action) -> str: + """Execute a single action and return a description of what was done.""" + match action.type: + case "tap": + px, py = self._to_pixels(action.x, action.y) + self._run("shell", "input", "tap", str(px), str(py)) + desc = f"tap ({px}, {py})" + + case "long_press": + px, py = self._to_pixels(action.x, action.y) + self._run("shell", "input", "swipe", + str(px), str(py), str(px), str(py), str(action.duration)) + desc = f"long_press ({px}, {py}) {action.duration}ms" + + case "swipe": + px1, py1 = self._to_pixels(action.x, action.y) + px2, py2 = self._to_pixels(action.x2, action.y2) + self._run("shell", "input", "swipe", + str(px1), str(py1), str(px2), 
str(py2), str(action.duration)) + desc = f"swipe ({px1},{py1}) → ({px2},{py2})" + + case "type": + # Escape special characters for ADB + escaped = action.text.replace(" ", "%s").replace("&", "\\&") + self._run("shell", "input", "text", escaped) + desc = f"type '{action.text}'" + + case "back": + self._run("shell", "input", "keyevent", "KEYCODE_BACK") + desc = "back" + + case "home": + self._run("shell", "input", "keyevent", "KEYCODE_HOME") + desc = "home" + + case "scroll": + # Scroll direction: swipe center screen + px, py = self._to_pixels(0.5, 0.5) + if action.y < 0: # scroll up + self._run("shell", "input", "swipe", + str(px), str(py - 300), str(px), str(py + 300), "300") + desc = "scroll up" + else: # scroll down + self._run("shell", "input", "swipe", + str(px), str(py + 300), str(px), str(py - 300), "300") + desc = "scroll down" + + case "wait": + time.sleep(action.duration / 1000) + desc = f"wait {action.duration}ms" + + case _: + raise ValueError(f"Unknown action type: {action.type}") + + # Wait for UI to settle after action + time.sleep(settings.action_delay) + return desc diff --git a/src/grounding/__init__.py b/src/grounding/__init__.py new file mode 100644 index 0000000..71cdd5f --- /dev/null +++ b/src/grounding/__init__.py @@ -0,0 +1,3 @@ +from .ocr_grounding import OCRGrounding + +__all__ = ["OCRGrounding"] diff --git a/src/grounding/ocr_grounding.py b/src/grounding/ocr_grounding.py new file mode 100644 index 0000000..d773ad3 --- /dev/null +++ b/src/grounding/ocr_grounding.py @@ -0,0 +1,354 @@ +"""L3 - OCR-Based UI Element Grounding + +Locates UI elements on screen by visible text using OCR on ADB screenshots. +Provides reliable text-to-coordinate mapping that works on Huawei/HarmonyOS +where uiautomator dump often returns empty XML for WeChat. + +Strategy priority (auto mode): +1. easyocr (best Chinese recognition, deep learning based) +2. pytesseract (fallback, fast but fragments Chinese characters) +3. 
uiautomator XML dump (supplementary, often empty on Huawei WeChat) + +All coordinates returned as normalized (0.0-1.0) for consistency with the +existing coordinate system in adb_executor.py. +""" + +import subprocess +import re +import io +import logging +from dataclasses import dataclass +from pathlib import Path +from PIL import Image + +from config import settings + +logger = logging.getLogger(__name__) + + +@dataclass +class TextBox: + """A detected text region on screen.""" + text: str + x: int # left pixel + y: int # top pixel + w: int # width pixels + h: int # height pixels + confidence: float # 0.0-1.0 + + @property + def cx(self) -> int: + """Center x in pixels.""" + return self.x + self.w // 2 + + @property + def cy(self) -> int: + """Center y in pixels.""" + return self.y + self.h // 2 + + def center_normalized(self, screen_w: int, screen_h: int) -> tuple[float, float]: + """Return center as normalized (0-1) coordinates.""" + return self.cx / screen_w, self.cy / screen_h + + def contains_text(self, query: str, fuzzy: bool = True) -> bool: + """Check if this box's text matches the query. + + Args: + query: Text to search for. + fuzzy: If True, does substring + case-insensitive match. + """ + if not query or not self.text: + return False + if fuzzy: + return query.lower() in self.text.lower() or self.text.lower() in query.lower() + return self.text == query + + def match_score(self, query: str) -> float: + """Compute a match quality score (higher = better). 
+ + Scoring: + - Exact match: 1000 + confidence + - Query is full text: 500 + confidence + - Text contains query as substring: 100 + confidence + length_ratio + - Query contains text as substring: 50 + confidence + - No match: 0 + """ + if not query or not self.text: + return 0.0 + + q = query.lower() + t = self.text.lower().strip() + + if t == q: + return 1000 + self.confidence + if q in t: + # Prefer shorter texts that contain the query (more precise) + length_ratio = len(q) / max(len(t), 1) + return 100 + self.confidence + length_ratio + if t in q: + # Text is a subset of query -- weaker match + length_ratio = len(t) / max(len(q), 1) + return 50 + self.confidence * length_ratio + return 0.0 + + +class OCRGrounding: + """OCR-based element grounding for Android screens. + + Usage: + grounding = OCRGrounding() + + # From ADB screenshot (PIL Image) + img = capture.screenshot() + result = grounding.find_text(img, "发送") + if result: + norm_x, norm_y = result.center_normalized(img.width, img.height) + # Use norm_x, norm_y with ADBExecutor + """ + + def __init__(self, engine: str = "auto"): + """ + Args: + engine: OCR engine to use. + "pytesseract" / "easyocr" / "auto" (easyocr first, pytesseract fallback) + """ + self.engine = engine + self._easyocr_reader = None # lazy init (slow first load) + + # ────────────────────────────────────────────── + # Public API + # ────────────────────────────────────────────── + + def find_text( + self, img: Image.Image, query: str, fuzzy: bool = True + ) -> TextBox | None: + """Find a UI element by visible text and return its bounding box. + + Args: + img: PIL Image (screenshot from ADB). + query: Text to search for (e.g. "发送", "微信", "Search"). + fuzzy: Substring/case-insensitive match. + + Returns: + Best matching TextBox, or None if not found. + """ + boxes = self.detect_all(img) + matches = [b for b in boxes if b.contains_text(query, fuzzy=fuzzy)] + + if not matches: + logger.warning(f"Text '{query}' not found. 
Detected texts: " + f"{[b.text for b in boxes[:20]]}") + return None + + # Return best match by match_score (prefers exact/longer matches) + matches.sort(key=lambda b: b.match_score(query), reverse=True) + best = matches[0] + logger.info(f"Found '{query}' → '{best.text}' at ({best.cx}, {best.cy}) " + f"conf={best.confidence:.2f} score={best.match_score(query):.1f}") + return best + + def find_all_matches( + self, img: Image.Image, query: str, fuzzy: bool = True + ) -> list[TextBox]: + """Find ALL matching elements (e.g., multiple chat contacts named similar).""" + boxes = self.detect_all(img) + return [b for b in boxes if b.contains_text(query, fuzzy=fuzzy)] + + def detect_all(self, img: Image.Image) -> list[TextBox]: + """Run OCR on the full image and return all detected text boxes. + + Tries engines in order based on self.engine setting. + """ + if self.engine == "pytesseract": + return self._detect_pytesseract(img) + elif self.engine == "easyocr": + return self._detect_easyocr(img) + else: # auto + # Prefer easyocr (much better Chinese recognition), fall back to pytesseract + try: + return self._detect_easyocr(img) + except Exception as e: + logger.info(f"easyocr failed ({e}), trying pytesseract") + + try: + boxes = self._detect_pytesseract(img) + if boxes: + return boxes + except Exception as e: + logger.error(f"All OCR engines failed: {e}") + + return [] + + def find_text_normalized( + self, img: Image.Image, query: str, fuzzy: bool = True + ) -> tuple[float, float] | None: + """Convenience: find text and return normalized (x, y) center directly. + + Returns None if not found. 
+ """ + box = self.find_text(img, query, fuzzy=fuzzy) + if box is None: + return None + return box.center_normalized(img.width, img.height) + + # ────────────────────────────────────────────── + # pytesseract engine + # ────────────────────────────────────────────── + + def _detect_pytesseract(self, img: Image.Image) -> list[TextBox]: + """Detect text using pytesseract (calls tesseract binary). + + Uses chi_sim+eng for Chinese + English mixed content (common in WeChat). + Falls back to eng-only if chi_sim data is not installed. + """ + import pytesseract + + # Try Chinese+English first, fall back to English only + for lang in ["chi_sim+eng", "eng"]: + try: + data = pytesseract.image_to_data( + img, + lang=lang, + output_type=pytesseract.Output.DICT, + config="--psm 11" # Sparse text: find as much text as possible + ) + break + except pytesseract.TesseractError: + continue + else: + raise RuntimeError("Tesseract failed with all language configs") + + boxes = [] + n = len(data["text"]) + for i in range(n): + text = data["text"][i].strip() + conf = int(data["conf"][i]) + if not text or conf < 20: # skip low-confidence noise + continue + boxes.append(TextBox( + text=text, + x=data["left"][i], + y=data["top"][i], + w=data["width"][i], + h=data["height"][i], + confidence=conf / 100.0, + )) + + return boxes + + # ────────────────────────────────────────────── + # easyocr engine + # ────────────────────────────────────────────── + + def _detect_easyocr(self, img: Image.Image) -> list[TextBox]: + """Detect text using easyocr (better for Chinese, uses deep learning). + + First call is slow (~10s) due to model loading. Subsequent calls are fast. 
+ """ + import easyocr + import numpy as np + + if self._easyocr_reader is None: + self._easyocr_reader = easyocr.Reader( + ["ch_sim", "en"], + gpu=False, # CPU is fine for single screenshots + ) + + # Convert PIL to numpy array for easyocr + img_np = np.array(img.convert("RGB")) + results = self._easyocr_reader.readtext(img_np) + + boxes = [] + for (bbox, text, conf) in results: + if not text.strip(): + continue + # bbox is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (quadrilateral) + xs = [p[0] for p in bbox] + ys = [p[1] for p in bbox] + x = int(min(xs)) + y = int(min(ys)) + w = int(max(xs) - x) + h = int(max(ys) - y) + boxes.append(TextBox( + text=text.strip(), + x=x, y=y, w=w, h=h, + confidence=float(conf), + )) + + return boxes + + # ────────────────────────────────────────────── + # uiautomator XML dump (supplementary, often empty on Huawei) + # ────────────────────────────────────────────── + + def try_uiautomator_dump(self, serial: str | None = None) -> list[TextBox]: + """Attempt to get UI elements from uiautomator dump. + + NOTE: This often returns nearly empty XML on Huawei/HarmonyOS, + especially for WeChat. Use as a supplementary source, not primary. + + Args: + serial: Device serial (None = use settings or first device). + + Returns: + List of TextBox from accessibility tree, may be empty. 
+ """ + adb = settings.adb_path + cmd = [adb] + if serial or settings.device_serial: + cmd.extend(["-s", serial or settings.device_serial]) + + # Dump to device, then pull + dump_cmd = cmd + ["shell", "uiautomator", "dump", "/sdcard/ui_dump.xml"] + pull_cmd = cmd + ["shell", "cat", "/sdcard/ui_dump.xml"] + + try: + subprocess.run(dump_cmd, capture_output=True, timeout=10) + result = subprocess.run(pull_cmd, capture_output=True, text=True, timeout=5) + xml_content = result.stdout + except Exception as e: + logger.warning(f"uiautomator dump failed: {e}") + return [] + + return self._parse_uiautomator_xml(xml_content) + + def _parse_uiautomator_xml(self, xml_str: str) -> list[TextBox]: + """Parse uiautomator dump XML into TextBox list.""" + boxes = [] + # Pattern: text="..." bounds="[x1,y1][x2,y2]" + pattern = r'text="([^"]*)"[^>]*bounds="\[(\d+),(\d+)\]\[(\d+),(\d+)\]"' + for match in re.finditer(pattern, xml_str): + text = match.group(1).strip() + if not text: + continue + x1, y1, x2, y2 = (int(match.group(i)) for i in range(2, 6)) + boxes.append(TextBox( + text=text, + x=x1, y=y1, + w=x2 - x1, h=y2 - y1, + confidence=1.0, # accessibility tree is authoritative + )) + return boxes + + # ────────────────────────────────────────────── + # Hybrid: combine OCR + uiautomator + # ────────────────────────────────────────────── + + def find_text_hybrid( + self, img: Image.Image, query: str, fuzzy: bool = True + ) -> TextBox | None: + """Try uiautomator first (exact bounds), fall back to OCR. + + Best strategy for Huawei: uiautomator might work for some apps, + OCR always works as fallback. 
+ """ + # Try uiautomator first (precise but often empty on Huawei) + ua_boxes = self.try_uiautomator_dump() + ua_matches = [b for b in ua_boxes if b.contains_text(query, fuzzy=fuzzy)] + if ua_matches: + logger.info(f"Found '{query}' via uiautomator") + return ua_matches[0] + + # Fall back to OCR + logger.info(f"uiautomator found nothing for '{query}', using OCR") + return self.find_text(img, query, fuzzy=fuzzy) diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..cb18e94 --- /dev/null +++ b/src/main.py @@ -0,0 +1,122 @@ +"""Phone GUI Agent - Main Entry Point + +Web console for controlling the agent loop. +""" + +import asyncio +import json +from pathlib import Path + +from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request +from fastapi.responses import HTMLResponse +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates + +from config import settings +from src.capture import ADBCapture +from src.planner.agent_loop import AgentLoop + +app = FastAPI(title="Phone GUI Agent", version="0.1.0") + +BASE_DIR = Path(__file__).parent.parent +app.mount("/static", StaticFiles(directory=BASE_DIR / "web" / "static"), name="static") +templates = Jinja2Templates(directory=BASE_DIR / "web" / "templates") + +# Global state +capture = ADBCapture() +agent = AgentLoop() + + +@app.get("/", response_class=HTMLResponse) +async def index(request: Request): + return templates.TemplateResponse(request, "index.html") + + +@app.get("/api/device") +async def device_info(): + """Check device connection status.""" + try: + info = capture.check_device() + return info + except Exception as e: + return {"connected": False, "error": str(e)} + + +@app.get("/api/screenshot") +async def take_screenshot(): + """Take a screenshot and return base64.""" + try: + b64 = capture.screenshot_base64() + return {"ok": True, "image": b64} + except Exception as e: + return {"ok": False, "error": str(e)} + + +@app.post("/api/stop") +async def 
@app.websocket("/ws/task")
async def task_websocket(ws: WebSocket):
    """WebSocket endpoint for running tasks with real-time updates.

    Client sends: {"task": "打开微信搜索张三"}
    Server streams: StepResult objects as JSON

    Message order on the socket is: one ``started`` frame, one ``step`` frame
    per agent step, then a final frame carrying the session status.
    """
    await ws.accept()
    try:
        data = await ws.receive_json()
        task = data.get("task", "")
        if not task:
            await ws.send_json({"error": "No task provided"})
            return

        await ws.send_json({"status": "started", "task": task})

        loop = asyncio.get_running_loop()
        # Step frames are sent in the background; keep their futures so they
        # can be flushed (and their errors collected) before the final frame.
        pending = []

        def on_step(result):
            payload = {
                "status": "step",
                "step": result.step,
                "observation": result.observation,
                "thinking": result.thinking,
                "action_type": result.action_type,
                "action_desc": result.action_desc,
                "screenshot": result.screenshot_before[:100] + "..." if result.screenshot_before else None,
                "error": result.error,
            }
            # Thread-safe scheduling, so the callback works whether it is
            # invoked on the event-loop thread or from a worker thread.
            pending.append(
                asyncio.run_coroutine_threadsafe(ws.send_json(payload), loop)
            )

        session = await agent.run_task(task, on_step=on_step)

        # Without this wait the final status frame could be written before
        # the last step frame, because step sends are only scheduled, not
        # awaited, inside on_step.
        if pending:
            await asyncio.gather(
                *(asyncio.wrap_future(f) for f in pending),
                return_exceptions=True,
            )

        await ws.send_json({
            "status": session.status,
            "total_steps": len(session.steps),
            "task": task,
        })

    except WebSocketDisconnect:
        agent.stop()
    except Exception as e:
        # Best effort: report the failure to the client; the socket may
        # already be closed, in which case there is nobody left to tell.
        try:
            await ws.send_json({"error": str(e)})
        except Exception:
            pass
Analysis → Action Execution → Verification → Repeat +""" + +import asyncio +import time +from dataclasses import dataclass, field +from datetime import datetime + +from src.capture import ADBCapture +from src.vision import VLMClient +from src.executor.adb_executor import ADBExecutor, Action + + +@dataclass +class StepResult: + step: int + timestamp: str + observation: str + thinking: str + action_type: str + action_desc: str + screenshot_before: str # base64 + screenshot_after: str | None = None + verified: bool = False + error: str | None = None + + +@dataclass +class TaskSession: + task: str + status: str = "running" # running / completed / failed / stopped + steps: list[StepResult] = field(default_factory=list) + started_at: str = "" + finished_at: str = "" + + def history(self) -> list[dict]: + """Return history for VLM context.""" + return [ + { + "observation": s.observation, + "action": {"type": s.action_type}, + } + for s in self.steps + ] + + +class AgentLoop: + """Main agent loop orchestrating all pipeline layers.""" + + def __init__(self): + self.capture = ADBCapture() + self.vlm = VLMClient() + self.executor = ADBExecutor(self.capture) + self.current_session: TaskSession | None = None + self._stop_requested = False + + def stop(self): + self._stop_requested = True + + async def run_task(self, task: str, on_step=None) -> TaskSession: + """Execute a task through the full agent loop. + + Args: + task: Natural language task instruction. + on_step: Optional callback called after each step with StepResult. + + Returns: + TaskSession with all steps and final status. 
+ """ + from config import settings + + session = TaskSession( + task=task, + started_at=datetime.now().isoformat(), + ) + self.current_session = session + self._stop_requested = False + + try: + for step_num in range(1, settings.max_steps + 1): + if self._stop_requested: + session.status = "stopped" + break + + result = await self._execute_step(step_num, task, session) + session.steps.append(result) + + if on_step: + on_step(result) + + if result.action_type == "done": + session.status = "completed" + break + + if result.error: + # Allow up to 3 consecutive errors before failing + recent_errors = sum( + 1 for s in session.steps[-3:] if s.error + ) + if recent_errors >= 3: + session.status = "failed" + break + else: + session.status = "failed" # max steps exceeded + + except Exception as e: + session.status = "failed" + if session.steps: + session.steps[-1].error = str(e) + + session.finished_at = datetime.now().isoformat() + self.current_session = None + return session + + async def _execute_step( + self, step_num: int, task: str, session: TaskSession + ) -> StepResult: + """Execute a single step in the agent loop.""" + timestamp = datetime.now().isoformat() + + # L1: Capture screenshot + try: + screenshot_b64 = self.capture.screenshot_base64() + except Exception as e: + return StepResult( + step=step_num, timestamp=timestamp, + observation="", thinking="", + action_type="error", action_desc="", + screenshot_before="", error=f"Screenshot failed: {e}" + ) + + # L2+L3+L4: VLM analysis (understanding + grounding + planning) + try: + response = await self.vlm.analyze_screen( + screenshot_b64, task, session.history() + ) + except Exception as e: + return StepResult( + step=step_num, timestamp=timestamp, + observation="", thinking="", + action_type="error", action_desc="", + screenshot_before=screenshot_b64, + error=f"VLM analysis failed: {e}" + ) + + observation = response.get("observation", "") + thinking = response.get("thinking", "") + action_data = 
response["action"] + action_type = action_data["type"] + + # Task complete + if action_type == "done": + return StepResult( + step=step_num, timestamp=timestamp, + observation=observation, thinking=thinking, + action_type="done", action_desc="Task completed", + screenshot_before=screenshot_b64, + ) + + # L5: Execute action + action = Action( + type=action_type, + x=action_data.get("x", 0), + y=action_data.get("y", 0), + text=action_data.get("text", ""), + x2=action_data.get("x2", 0), + y2=action_data.get("y2", 0), + duration=action_data.get("duration", 300), + ) + + try: + action_desc = self.executor.execute(action) + except Exception as e: + return StepResult( + step=step_num, timestamp=timestamp, + observation=observation, thinking=thinking, + action_type=action_type, action_desc="", + screenshot_before=screenshot_b64, + error=f"Execution failed: {e}" + ) + + # L6: Verify by taking post-action screenshot + screenshot_after = None + if settings.verify_after_action: + try: + screenshot_after = self.capture.screenshot_base64() + except Exception: + pass # non-critical + + return StepResult( + step=step_num, timestamp=timestamp, + observation=observation, thinking=thinking, + action_type=action_type, action_desc=action_desc, + screenshot_before=screenshot_b64, + screenshot_after=screenshot_after, + verified=screenshot_after is not None, + ) diff --git a/src/verifier/__init__.py b/src/verifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/vision/__init__.py b/src/vision/__init__.py new file mode 100644 index 0000000..f0dedd0 --- /dev/null +++ b/src/vision/__init__.py @@ -0,0 +1,3 @@ +from .vlm_client import VLMClient + +__all__ = ["VLMClient"] diff --git a/src/vision/vlm_client.py b/src/vision/vlm_client.py new file mode 100644 index 0000000..867945f --- /dev/null +++ b/src/vision/vlm_client.py @@ -0,0 +1,171 @@ +"""L2+L3 - Vision Language Model Client + +Sends screenshots to VLM for screen understanding and element grounding. 
+Supports multiple providers: Poe API (preferred), OpenRouter (backup), local. +""" + +import base64 +import httpx +from PIL import Image +import io + +from config import settings + + +SYSTEM_PROMPT = """你是一个手机 GUI 操控助手。你会收到一张 Android 手机截图和一个用户任务指令。 + +你的职责: +1. 分析当前屏幕内容(识别所有 UI 元素、文本、图标、按钮) +2. 根据任务目标,决定下一步要执行的操作 +3. 精确定位目标元素的屏幕坐标 + +输出格式(严格 JSON): +{ + "observation": "当前屏幕的简要描述", + "thinking": "下一步应该做什么,为什么", + "action": { + "type": "tap|swipe|type|long_press|back|home|scroll|wait|done", + "x": 0.5, + "y": 0.3, + "text": "", + "x2": 0.0, + "y2": 0.0, + "duration": 300 + } +} + +坐标说明: +- x, y 为归一化坐标,范围 0.0-1.0 +- (0, 0) 是屏幕左上角,(1, 1) 是右下角 +- 点击按钮时,坐标应指向按钮的中心位置 + +当任务完成时,action.type 设为 "done"。 +""" + + +class VLMClient: + """Multi-provider VLM client for screen understanding.""" + + def __init__(self): + self.provider = settings.vlm_provider + self.model = settings.vlm_model + + async def analyze_screen( + self, screenshot_b64: str, task: str, history: list[dict] | None = None + ) -> dict: + """Send screenshot to VLM and get structured action response. + + Args: + screenshot_b64: Base64-encoded PNG screenshot. + task: User's task instruction. + history: Previous observation/action pairs for context. + + Returns: + Parsed dict with observation, thinking, and action. 
+ """ + messages = self._build_messages(screenshot_b64, task, history) + + match self.provider: + case "poe": + raw = await self._call_poe(messages) + case "openrouter": + raw = await self._call_openrouter(messages) + case "local": + raw = await self._call_local(messages) + case _: + raise ValueError(f"Unknown VLM provider: {self.provider}") + + return self._parse_response(raw) + + def _build_messages( + self, screenshot_b64: str, task: str, history: list[dict] | None + ) -> list[dict]: + messages = [{"role": "system", "content": SYSTEM_PROMPT}] + + # Add history context + if history: + history_text = "\n".join( + f"Step {i+1}: {h['observation']} → {h['action']['type']}" + for i, h in enumerate(history[-5:]) # last 5 steps + ) + messages.append({ + "role": "user", + "content": f"历史操作记录:\n{history_text}" + }) + + # Current step: screenshot + task + messages.append({ + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"} + }, + { + "type": "text", + "text": f"当前任务:{task}\n\n请分析截图并给出下一步操作。" + }, + ], + }) + return messages + + async def _call_poe(self, messages: list[dict]) -> str: + """Call Poe API (preferred, cheapest).""" + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + "https://api.poe.com/v1/chat/completions", + headers={ + "Authorization": f"Bearer {settings.poe_api_key}", + "Content-Type": "application/json", + }, + json={"model": self.model, "messages": messages}, + ) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + + async def _call_openrouter(self, messages: list[dict]) -> str: + """Call OpenRouter API (backup).""" + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + "https://openrouter.ai/api/v1/chat/completions", + headers={ + "Authorization": f"Bearer {settings.openrouter_api_key}", + "Content-Type": "application/json", + }, + json={"model": self.model, "messages": messages}, + ) + 
resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + + async def _call_local(self, messages: list[dict]) -> str: + """Call local vLLM/Ollama server.""" + async with httpx.AsyncClient(timeout=60) as client: + resp = await client.post( + "http://localhost:11434/v1/chat/completions", + json={"model": self.model, "messages": messages}, + ) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + + def _parse_response(self, raw: str) -> dict: + """Parse VLM response into structured action dict.""" + import json + import re + + # Extract JSON from response (handle markdown code blocks) + json_match = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL) + if json_match: + raw = json_match.group(1) + + # Try to find JSON object directly + json_match = re.search(r"\{.*\}", raw, re.DOTALL) + if not json_match: + raise ValueError(f"No JSON found in VLM response: {raw[:200]}") + + parsed = json.loads(json_match.group()) + + # Validate required fields + assert "action" in parsed, "Missing 'action' field" + assert "type" in parsed["action"], "Missing action 'type'" + + return parsed diff --git a/web/templates/index.html b/web/templates/index.html new file mode 100644 index 0000000..807cb8b --- /dev/null +++ b/web/templates/index.html @@ -0,0 +1,192 @@ + + + + + + Phone GUI Agent + + + +
+
+

Phone GUI Agent

+ 检测设备中... +
+ +
+
+
+ +
+ + + +
+
+
+
+ +
+
+ +
连接设备后显示截图
+
+
+ +
+
+

Agent 思考过程

+
+
+
+
+ + + +