auto-save 2026-04-01 09:03 (+8, ~2)

2026-04-01 09:04:04 +08:00
parent 0ddaa889de
commit 9709573870
70 changed files with 2331 additions and 9 deletions
--- a/scripts/test_device.py
+++ b/scripts/test_device.py
@@ -0,0 +1,38 @@
+"""Quick test: check ADB device connection and take a screenshot."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from src.capture import ADBCapture
+
+
+def main():
+    """Smoke-test the ADB link: report the device, then take one screenshot.
+
+    Prints the details returned by ``ADBCapture.check_device()``. When that
+    result says no device is connected, the reported error and a short
+    troubleshooting checklist are printed and the process exits with
+    status 1. Otherwise a screenshot is captured with ``save=True`` and its
+    dimensions and the capture directory are printed.
+    """
+    capture = ADBCapture()
+
+    print("Checking device...")
+    status = capture.check_device()
+
+    if not status["connected"]:
+        # Emit the failure report line by line, then stop with a non-zero code.
+        failure_lines = (
+            f"[FAIL] {status['error']}",
+            "",
+            "Troubleshooting:",
+            "  1. USB debugging enabled on phone?",
+            "  2. Run: adb devices",
+            "  3. Accept USB debugging prompt on phone",
+        )
+        for line in failure_lines:
+            print(line)
+        sys.exit(1)
+
+    # Device summary, in the order the capture layer's fields are reported.
+    print(f"[OK] Device: {status['model']}")
+    print(f"     Serial: {status['serial']}")
+    print(f"     Resolution: {status['resolution']}")
+    print(f"     All devices: {status['all_devices']}")
+
+    print("\nTaking screenshot...")
+    shot = capture.screenshot(save=True)
+    dims = shot.size
+    print(f"[OK] Screenshot: {dims[0]}x{dims[1]}")
+    print(f"     Saved to: {capture.screenshot_dir}/")
+
+
+# Run the device check only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/scripts/test_ocr_grounding.py
+++ b/scripts/test_ocr_grounding.py
@@ -0,0 +1,149 @@
+"""Test OCR grounding: take a screenshot and find text elements.
+
+Usage:
+    # Find a specific text on current screen
+    python scripts/test_ocr_grounding.py "微信"
+
+    # Detect ALL text on screen (debug mode)
+    python scripts/test_ocr_grounding.py --all
+
+    # Use a saved screenshot instead of live ADB capture
+    python scripts/test_ocr_grounding.py "发送" --image data/screenshots/test.png
+
+    # Try different engines
+    python scripts/test_ocr_grounding.py "微信" --engine easyocr
+    python scripts/test_ocr_grounding.py "微信" --engine pytesseract
+
+    # Also try uiautomator dump (hybrid mode)
+    python scripts/test_ocr_grounding.py "微信" --hybrid
+
+    # Save annotated screenshot with bounding boxes drawn
+    python scripts/test_ocr_grounding.py --all --annotate
+"""
+
+import sys
+import os
+import argparse
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from PIL import Image, ImageDraw, ImageFont
+from src.grounding.ocr_grounding import OCRGrounding
+
+
+def annotate_image(img: Image.Image, boxes, query: str = "") -> Image.Image:
+    """Draw bounding boxes and text labels on a copy of the image.
+
+    Args:
+        img: Source image. It is not modified; drawing happens on a copy.
+        boxes: Iterable of detected regions. Each item must expose ``x``,
+            ``y``, ``w``, ``h``, ``text``, ``confidence`` and a
+            ``contains_text(query)`` method.
+        query: Optional search text. Boxes for which
+            ``contains_text(query)`` is true are drawn in red with a
+            3-pixel outline; all others are lime with a 1-pixel outline.
+            An empty query highlights nothing.
+
+    Returns:
+        The annotated copy of ``img``.
+    """
+    annotated = img.copy()
+    draw = ImageDraw.Draw(annotated)
+    # Vertical gap between a box's top edge and its label.
+    label_offset = 14
+
+    for box in boxes:
+        is_match = box.contains_text(query) if query else False
+        color = "red" if is_match else "lime"
+        width = 3 if is_match else 1
+
+        draw.rectangle(
+            [box.x, box.y, box.x + box.w, box.y + box.h],
+            outline=color, width=width,
+        )
+
+        label = f"{box.text} ({box.confidence:.0%})"
+        # Clamp to the canvas: a box near the top edge would otherwise get
+        # its label drawn at a negative y and be clipped away.
+        label_pos = (box.x, max(0, box.y - label_offset))
+        try:
+            draw.text(label_pos, label, fill=color)
+        except UnicodeEncodeError:
+            # Pillow's bitmap default font only handles Latin-1, so CJK text
+            # cannot be rendered with it. Substitute "?" for those characters
+            # instead of aborting the whole annotation.
+            safe_label = label.encode("latin-1", "replace").decode("latin-1")
+            draw.text(label_pos, safe_label, fill=color)
+
+    return annotated
+
+
+def _build_parser():
+    """Build the command-line parser for the OCR grounding test script."""
+    parser = argparse.ArgumentParser(description="Test OCR grounding on phone screen")
+    parser.add_argument("query", nargs="?", default=None, help="Text to find on screen")
+    parser.add_argument("--all", action="store_true", help="Detect all text on screen")
+    parser.add_argument("--image", type=str, help="Use saved screenshot instead of ADB")
+    parser.add_argument("--engine", type=str, default="auto",
+                       choices=["auto", "pytesseract", "easyocr"],
+                       help="OCR engine to use")
+    parser.add_argument("--hybrid", action="store_true",
+                       help="Try uiautomator + OCR hybrid approach")
+    parser.add_argument("--annotate", action="store_true",
+                       help="Save annotated screenshot with bounding boxes")
+    return parser
+
+
+def _load_image(image_path):
+    """Return the image to analyse: a saved file if given, else a live capture.
+
+    Exits with status 1 when the file cannot be opened or when
+    ``check_device()`` reports that no device is connected.
+    """
+    if image_path:
+        print(f"Loading image: {image_path}")
+        try:
+            return Image.open(image_path)
+        except OSError as exc:
+            # OSError covers a missing/unreadable file as well as Pillow's
+            # UnidentifiedImageError; report it like a device failure
+            # instead of dumping a traceback.
+            print(f"[FAIL] Cannot load image: {exc}")
+            sys.exit(1)
+
+    # Imported here so the capture module is only loaded when a live
+    # screenshot is actually needed.
+    from src.capture import ADBCapture
+    cap = ADBCapture()
+    info = cap.check_device()
+    if not info["connected"]:
+        print(f"[FAIL] {info['error']}")
+        sys.exit(1)
+    print(f"Device: {info['model']} ({info['resolution']})")
+    print("Taking screenshot...")
+    return cap.screenshot(save=True)
+
+
+def _save_annotated(img, boxes, query, out_path):
+    """Render the boxes onto a copy of the image and save it to ``out_path``."""
+    annotated = annotate_image(img, boxes, query=query)
+    out_dir = os.path.dirname(out_path)
+    if out_dir:
+        # The target directory may not exist yet (e.g. when running from a
+        # saved --image before any live capture); create it so save() works.
+        os.makedirs(out_dir, exist_ok=True)
+    annotated.save(out_path)
+    print(f"\nAnnotated image saved: {out_path}")
+
+
+def _report_all(grounding, img, args):
+    """Print every detected text region and return the detected boxes."""
+    print(f"\n--- Detecting ALL text (engine={args.engine}) ---\n")
+    boxes = grounding.detect_all(img)
+    if not boxes:
+        print("[WARN] No text detected!")
+        return boxes
+
+    print(f"Found {len(boxes)} text regions:\n")
+    for i, box in enumerate(boxes, 1):
+        nx, ny = box.center_normalized(img.width, img.height)
+        print(f"  {i:3d}. '{box.text}'")
+        print(f"       pixel=({box.cx}, {box.cy})  "
+              f"norm=({nx:.3f}, {ny:.3f})  "
+              f"conf={box.confidence:.0%}")
+
+    if args.annotate:
+        _save_annotated(img, boxes, args.query or "",
+                        "data/screenshots/annotated_all.png")
+    return boxes
+
+
+def _report_search(grounding, img, args, boxes=None):
+    """Search for ``args.query``, print the result, and optionally annotate.
+
+    ``boxes`` may carry the result of an earlier ``detect_all`` pass on the
+    same image so the annotation step does not repeat it. Exits with
+    status 1 when the query is not found.
+    """
+    print(f"\n--- Searching for: '{args.query}' (engine={args.engine}) ---\n")
+
+    if args.hybrid:
+        result = grounding.find_text_hybrid(img, args.query)
+    else:
+        result = grounding.find_text(img, args.query)
+
+    if result is None:
+        print(f"[NOT FOUND] '{args.query}' was not found on screen.")
+        print("\nTip: Run with --all to see all detected text.")
+        sys.exit(1)
+
+    nx, ny = result.center_normalized(img.width, img.height)
+    print(f"[FOUND] '{result.text}'")
+    print(f"  Pixel center:      ({result.cx}, {result.cy})")
+    print(f"  Normalized center:  ({nx:.4f}, {ny:.4f})")
+    print(f"  Bounding box:      x={result.x} y={result.y} "
+          f"w={result.w} h={result.h}")
+    print(f"  Confidence:        {result.confidence:.0%}")
+    print()
+    print("  To tap this element:")
+    print(f"    adb shell input tap {result.cx} {result.cy}")
+
+    # Show all matches
+    all_matches = grounding.find_all_matches(img, args.query)
+    if len(all_matches) > 1:
+        print(f"\n  ({len(all_matches)} total matches found)")
+        for i, m in enumerate(all_matches, 1):
+            print(f"    {i}. '{m.text}' at ({m.cx},{m.cy}) conf={m.confidence:.0%}")
+
+    if args.annotate:
+        if boxes is None:
+            boxes = grounding.detect_all(img)
+        _save_annotated(img, boxes, args.query,
+                        "data/screenshots/annotated_search.png")
+
+
+def main():
+    """Entry point: run OCR grounding on a screenshot and print the findings.
+
+    Parses the command line, obtains an image (from ``--image`` or a live
+    ADB capture), then runs the ``--all`` listing and/or the query search.
+    With ``--annotate`` the detected boxes are drawn onto a copy of the
+    image and saved under ``data/screenshots/``.
+
+    Exits via ``parser.error`` when neither a query nor ``--all`` is given,
+    and with status 1 when the image cannot be loaded, no device is
+    connected, or the query is not found.
+    """
+    parser = _build_parser()
+    args = parser.parse_args()
+
+    if not args.query and not args.all:
+        parser.error("Provide a search query or --all")
+
+    img = _load_image(args.image)
+
+    print(f"Image size: {img.width}x{img.height}")
+    grounding = OCRGrounding(engine=args.engine)
+
+    # Keep the --all detection result so the search step can reuse it for
+    # annotation rather than running a second full OCR pass.
+    boxes = _report_all(grounding, img, args) if args.all else None
+
+    if args.query:
+        _report_search(grounding, img, args, boxes)
+
+
+# Run the OCR grounding test only when this file is executed as a script.
+if __name__ == "__main__":
+    main()