"""Test OCR grounding: take a screenshot and find text elements. Usage: # Find a specific text on current screen python scripts/test_ocr_grounding.py "微信" # Detect ALL text on screen (debug mode) python scripts/test_ocr_grounding.py --all # Use a saved screenshot instead of live ADB capture python scripts/test_ocr_grounding.py "发送" --image data/screenshots/test.png # Try different engines python scripts/test_ocr_grounding.py "微信" --engine easyocr python scripts/test_ocr_grounding.py "微信" --engine pytesseract # Also try uiautomator dump (hybrid mode) python scripts/test_ocr_grounding.py "微信" --hybrid # Save annotated screenshot with bounding boxes drawn python scripts/test_ocr_grounding.py --all --annotate """ import sys import os import argparse sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from PIL import Image, ImageDraw, ImageFont from src.grounding.ocr_grounding import OCRGrounding def annotate_image(img: Image.Image, boxes, query: str = "") -> Image.Image: """Draw bounding boxes on the image for visualization.""" annotated = img.copy() draw = ImageDraw.Draw(annotated) for box in boxes: is_match = box.contains_text(query) if query else False color = "red" if is_match else "lime" width = 3 if is_match else 1 draw.rectangle( [box.x, box.y, box.x + box.w, box.y + box.h], outline=color, width=width, ) label = f"{box.text} ({box.confidence:.0%})" draw.text((box.x, box.y - 14), label, fill=color) return annotated def main(): parser = argparse.ArgumentParser(description="Test OCR grounding on phone screen") parser.add_argument("query", nargs="?", default=None, help="Text to find on screen") parser.add_argument("--all", action="store_true", help="Detect all text on screen") parser.add_argument("--image", type=str, help="Use saved screenshot instead of ADB") parser.add_argument("--engine", type=str, default="auto", choices=["auto", "pytesseract", "easyocr"], help="OCR engine to use") parser.add_argument("--hybrid", action="store_true", help="Try uiautomator + OCR hybrid approach") parser.add_argument("--annotate", action="store_true", help="Save annotated screenshot with bounding boxes") args = parser.parse_args() if not args.query and not args.all: parser.error("Provide a search query or --all") # Get screenshot if args.image: print(f"Loading image: {args.image}") img = Image.open(args.image) else: from src.capture import ADBCapture cap = ADBCapture() info = cap.check_device() if not info["connected"]: print(f"[FAIL] {info['error']}") sys.exit(1) print(f"Device: {info['model']} ({info['resolution']})") print("Taking screenshot...") img = cap.screenshot(save=True) print(f"Image size: {img.width}x{img.height}") grounding = OCRGrounding(engine=args.engine) if args.all: print(f"\n--- Detecting ALL text (engine={args.engine}) ---\n") boxes = grounding.detect_all(img) if not boxes: print("[WARN] No text detected!") else: print(f"Found {len(boxes)} text regions:\n") for i, box in enumerate(boxes, 1): nx, ny = box.center_normalized(img.width, img.height) print(f" {i:3d}. '{box.text}'") print(f" pixel=({box.cx}, {box.cy}) " f"norm=({nx:.3f}, {ny:.3f}) " f"conf={box.confidence:.0%}") if args.annotate and boxes: out_path = "data/screenshots/annotated_all.png" annotated = annotate_image(img, boxes, query=args.query or "") annotated.save(out_path) print(f"\nAnnotated image saved: {out_path}") if args.query: print(f"\n--- Searching for: '{args.query}' (engine={args.engine}) ---\n") if args.hybrid: result = grounding.find_text_hybrid(img, args.query) else: result = grounding.find_text(img, args.query) if result is None: print(f"[NOT FOUND] '{args.query}' was not found on screen.") print("\nTip: Run with --all to see all detected text.") sys.exit(1) else: nx, ny = result.center_normalized(img.width, img.height) print(f"[FOUND] '{result.text}'") print(f" Pixel center: ({result.cx}, {result.cy})") print(f" Normalized center: ({nx:.4f}, {ny:.4f})") print(f" Bounding box: x={result.x} y={result.y} " f"w={result.w} h={result.h}") print(f" Confidence: {result.confidence:.0%}") print() print(f" To tap this element:") print(f" adb shell input tap {result.cx} {result.cy}") # Show all matches all_matches = grounding.find_all_matches(img, args.query) if len(all_matches) > 1: print(f"\n ({len(all_matches)} total matches found)") for i, m in enumerate(all_matches): print(f" {i+1}. '{m.text}' at ({m.cx},{m.cy}) conf={m.confidence:.0%}") if args.annotate: boxes = grounding.detect_all(img) out_path = "data/screenshots/annotated_search.png" annotated = annotate_image(img, boxes, query=args.query) annotated.save(out_path) print(f"\nAnnotated image saved: {out_path}") if __name__ == "__main__": main()