auto-save 2026-04-01 09:03 (+8, ~2)

This commit is contained in:
2026-04-01 09:04:04 +08:00
parent 0ddaa889de
commit 9709573870
70 changed files with 2331 additions and 9 deletions

View File

@@ -0,0 +1,43 @@
// Gradle plugins for this module: the Android application plugin plus the
// Kotlin-for-Android plugin.
plugins {
id("com.android.application")
id("org.jetbrains.kotlin.android")
}
// Android build configuration for the OCR service application.
android {
// Package namespace; matches the `package com.guiagent.ocr` used by the sources.
namespace = "com.guiagent.ocr"
compileSdk = 31
defaultConfig {
applicationId = "com.guiagent.ocr"
// API 26 is the floor: OcrService uses the two-argument Notification.Builder
// and startForegroundService, both of which need it.
minSdk = 26
targetSdk = 31
versionCode = 1
versionName = "1.0"
}
buildTypes {
release {
// Code shrinking/obfuscation is turned off for release builds.
isMinifyEnabled = false
}
}
// Java sources and Kotlin bytecode both target Java 8.
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
}
// Third-party libraries used by the app.
dependencies {
// ML Kit Text Recognition (Chinese) - bundled model (no GMS needed!)
// Used by OcrEngine via ChineseTextRecognizerOptions.
implementation("com.google.mlkit:text-recognition-chinese:16.0.0")
// HTTP server: NanoHTTPD is the base class of OcrHttpServer.
implementation("org.nanohttpd:nanohttpd:2.3.1")
// JSON: Gson serializes the response maps built in OcrHttpServer.
implementation("com.google.code.gson:gson:2.10.1")
}

View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Manifest for the OCR service app: one launcher activity and one foreground service. -->
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
<!-- OcrHttpServer reads images from paths such as /sdcard/ocr_screen.png.
     NOTE(review): no runtime permission request is visible in MainActivity; confirm how this permission gets granted. -->
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE"/>
<!-- Required for the embedded HTTP server to open its listening socket. -->
<uses-permission android:name="android.permission.INTERNET"/>
<!-- OcrService calls startForeground(). -->
<uses-permission android:name="android.permission.FOREGROUND_SERVICE"/>
<application
android:allowBackup="false"
android:label="OCR Service"
android:supportsRtl="true">
<!-- Launcher entry point; starts OcrService from onCreate. -->
<activity
android:name=".MainActivity"
android:exported="true">
<intent-filter>
<action android:name="android.intent.action.MAIN"/>
<category android:name="android.intent.category.LAUNCHER"/>
</intent-filter>
</activity>
<!-- Foreground service hosting the OCR HTTP server.
     NOTE(review): exported="true" lets other apps start this service; confirm that is intended
     (for example, for starting it from tooling outside the app). -->
<service
android:name=".OcrService"
android:exported="true"
android:foregroundServiceType="dataSync"/>
</application>
</manifest>

View File

@@ -0,0 +1,23 @@
package com.guiagent.ocr
import android.app.Activity
import android.content.Intent
import android.os.Bundle
import android.widget.TextView
/**
 * Launcher screen for the OCR service.
 *
 * Displays a plain status label and requests that [OcrService] be started as a
 * foreground service as soon as the activity is created.
 */
class MainActivity : Activity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)

        // Build the single status label that makes up the whole UI.
        val statusLabel = TextView(this)
        statusLabel.text = "OCR Service\nPort: 18900\nStarting..."
        statusLabel.textSize = 20f
        statusLabel.setPadding(40, 40, 40, 40)
        setContentView(statusLabel)

        // Start the service
        startForegroundService(Intent(this, OcrService::class.java))

        // The label is switched right after the start request is issued; it does
        // not wait for any confirmation from the service.
        statusLabel.text = "OCR Service\nPort: 18900\nRunning!"
    }
}

View File

@@ -0,0 +1,79 @@
package com.guiagent.ocr
import android.graphics.Bitmap
import android.graphics.BitmapFactory
import com.google.mlkit.vision.common.InputImage
import com.google.mlkit.vision.text.TextRecognition
import com.google.mlkit.vision.text.chinese.ChineseTextRecognizerOptions
import java.io.File
import java.util.concurrent.CountDownLatch
import java.util.concurrent.TimeUnit
/**
 * One recognized line of text together with its bounding rectangle.
 *
 * @property text the recognized string
 * @property x left edge of the bounding box
 * @property y top edge of the bounding box
 * @property w width of the bounding box
 * @property h height of the bounding box
 * @property confidence confidence value attached to the line by the recognizer
 */
data class TextBox(
    val text: String,
    val x: Int,
    val y: Int,
    val w: Int,
    val h: Int,
    val confidence: Float
) {
    /** Horizontal center of the box (integer division, so odd widths round down). */
    val cx: Int
        get() = x + w / 2

    /** Vertical center of the box (integer division, so odd heights round down). */
    val cy: Int
        get() = y + h / 2
}
/**
 * Process-wide OCR entry point backed by the ML Kit Chinese text recognizer.
 *
 * All functions block the calling thread until recognition finishes or times out.
 * They should be called from a background thread: task listeners registered
 * without an executor are delivered on the main thread, so blocking the main
 * thread here would stall until the timeout and yield no results.
 */
object OcrEngine {
    /** Maximum time, in seconds, to wait for one recognition task. */
    private const val RECOGNIZE_TIMEOUT_SECONDS = 10L

    // Created on first use so that loading the recognizer is deferred until needed.
    private val recognizer by lazy {
        TextRecognition.getClient(ChineseTextRecognizerOptions.Builder().build())
    }

    /**
     * Runs OCR on the image file at [imagePath].
     *
     * @return the recognized lines, or an empty list when the file does not
     *   exist or cannot be decoded as an image.
     */
    fun recognize(imagePath: String): List<TextBox> {
        if (!File(imagePath).exists()) return emptyList()
        val bitmap = BitmapFactory.decodeFile(imagePath) ?: return emptyList()
        return recognizeBitmap(bitmap)
    }

    /**
     * Captures the screen with `screencap -p` and recognizes it without writing
     * the image to disk.
     *
     * @return the recognized lines, or an empty list when the command produced
     *   no output or the output could not be decoded.
     */
    fun screencapAndRecognize(): List<TextBox> {
        val process = Runtime.getRuntime().exec("screencap -p")
        val bytes = try {
            // Close the stdout pipe as soon as it is drained, then reap the process.
            val captured = process.inputStream.use { it.readBytes() }
            process.waitFor()
            captured
        } finally {
            // Releases the remaining pipes even if reading or waiting failed.
            process.destroy()
        }
        if (bytes.isEmpty()) return emptyList()
        val bitmap = BitmapFactory.decodeByteArray(bytes, 0, bytes.size) ?: return emptyList()
        return recognizeBitmap(bitmap)
    }

    /**
     * Runs OCR on [bitmap] and blocks until the result is available.
     *
     * The bitmap is recycled once recognition has completed (successfully or
     * not). If recognition does not complete within the timeout, the bitmap is
     * left untouched, because the recognizer may still be reading it, and an
     * empty list is returned.
     *
     * @return one [TextBox] per recognized line that has a bounding box; empty
     *   on failure or timeout.
     */
    fun recognizeBitmap(bitmap: Bitmap): List<TextBox> {
        val image = InputImage.fromBitmap(bitmap, 0)
        // Written once by the success listener before countDown(); only read after
        // await() has returned true, so the latch orders the write before the read.
        var boxes: List<TextBox> = emptyList()
        val latch = CountDownLatch(1)
        recognizer.process(image)
            .addOnSuccessListener { visionText ->
                try {
                    boxes = visionText.textBlocks
                        .flatMap { it.lines }
                        .mapNotNull { line ->
                            // Lines without geometry cannot be located on screen; skip them.
                            val box = line.boundingBox ?: return@mapNotNull null
                            TextBox(
                                text = line.text,
                                x = box.left,
                                y = box.top,
                                w = box.width(),
                                h = box.height(),
                                confidence = line.confidence ?: 0.8f
                            )
                        }
                } finally {
                    // Always release the waiter, even if mapping the result throws.
                    latch.countDown()
                }
            }
            .addOnFailureListener { error ->
                android.util.Log.w("OcrEngine", "Text recognition failed", error)
                latch.countDown()
            }
        val finished = latch.await(RECOGNIZE_TIMEOUT_SECONDS, TimeUnit.SECONDS)
        if (!finished) {
            // The task is still running: do not recycle the bitmap underneath it and
            // do not touch `boxes`, which the listener may still assign.
            android.util.Log.w("OcrEngine", "Text recognition timed out after ${RECOGNIZE_TIMEOUT_SECONDS}s")
            return emptyList()
        }
        bitmap.recycle()
        return boxes
    }
}

View File

@@ -0,0 +1,88 @@
package com.guiagent.ocr
import android.graphics.BitmapFactory
import com.google.gson.Gson
import fi.iki.elonen.NanoHTTPD
import java.io.ByteArrayOutputStream
/**
 * Embedded HTTP server exposing OCR over three routes:
 *
 * - `/ocr`: recognize an image file (query param `path`, default [defaultPath])
 * - `/snap`: POST raw image bytes for recognition; a GET falls back to `/ocr`
 * - `/health`: static status document
 *
 * An optional `text` query param keeps only results whose text contains it.
 * Every other URI gets a plain-text 404.
 */
class OcrHttpServer(port: Int = 18900) : NanoHTTPD(port) {
    private val gson = Gson()
    private val defaultPath = "/sdcard/ocr_screen.png"

    /** Routes a request by its URI. */
    override fun serve(session: IHTTPSession): Response {
        val uri = session.uri
        if (uri == "/ocr") return handleOcr(session)
        if (uri == "/snap") return handleSnap(session)
        if (uri == "/health") {
            return jsonResponse(mapOf("status" to "ok", "engine" to "mlkit-chinese"))
        }
        return newFixedLengthResponse(Response.Status.NOT_FOUND, MIME_PLAINTEXT, "404")
    }

    /** File-based OCR: recognizes the image named by the `path` query param. */
    private fun handleOcr(session: IHTTPSession): Response {
        val queryParams = session.parms ?: emptyMap()
        val targetPath = queryParams["path"] ?: defaultPath
        return doOcr(queryParams["text"]) { OcrEngine.recognize(targetPath) }
    }

    /** OCR on image bytes POSTed in the request body, without storing them as a named file. */
    private fun handleSnap(session: IHTTPSession): Response {
        // Query params are captured before the body is parsed.
        val queryParams = session.parms ?: emptyMap()

        // GET: fall back to the file-based route.
        if (session.method != Method.POST) return handleOcr(session)

        // NanoHTTPD's parseBody spools binary data into a temporary file and
        // reports that file's path under the "postData" key.
        val spooled = HashMap<String, String>()
        session.parseBody(spooled)
        val tmpPath = spooled["postData"]
            ?: return jsonResponse(mapOf("error" to "no body received", "count" to 0))

        val imageBytes = java.io.File(tmpPath).readBytes()
        val bitmap = BitmapFactory.decodeByteArray(imageBytes, 0, imageBytes.size)
            ?: return jsonResponse(mapOf("error" to "decode failed", "size" to imageBytes.size, "count" to 0))

        return doOcr(queryParams["text"]) { OcrEngine.recognizeBitmap(bitmap) }
    }

    /**
     * Runs [recognize], optionally filters the boxes by [query], and wraps the
     * outcome in a JSON response. The reported `elapsed_ms` covers recognition
     * and filtering.
     */
    private fun doOcr(query: String?, recognize: () -> List<TextBox>): Response {
        val startedAt = System.currentTimeMillis()
        val recognized = recognize()
        val matches = if (!query.isNullOrBlank()) {
            recognized.filter { it.text.contains(query) }
        } else {
            recognized
        }
        val elapsed = System.currentTimeMillis() - startedAt

        val rows = matches.map { box ->
            mapOf(
                "text" to box.text,
                "x" to box.x,
                "y" to box.y,
                "w" to box.w,
                "h" to box.h,
                "cx" to box.cx,
                "cy" to box.cy,
                "confidence" to box.confidence
            )
        }
        return jsonResponse(
            mapOf(
                "results" to rows,
                "count" to matches.size,
                "elapsed_ms" to elapsed
            )
        )
    }

    /** Serializes [data] with Gson and returns it as a 200 `application/json` response. */
    private fun jsonResponse(data: Any): Response =
        newFixedLengthResponse(Response.Status.OK, "application/json", gson.toJson(data))
}

View File

@@ -0,0 +1,49 @@
package com.guiagent.ocr
import android.app.*
import android.content.Intent
import android.os.Build
import android.os.IBinder
import android.util.Log
/**
 * Foreground service that owns the embedded [OcrHttpServer].
 *
 * The server is started on the first start command and stopped when the service
 * is destroyed. The service is not bindable.
 */
class OcrService : Service() {
    private var server: OcrHttpServer? = null

    /**
     * Promotes the service to the foreground and starts the HTTP server if it
     * is not already running.
     *
     * @return [Service.START_STICKY] when the server is running; if the server
     *   cannot be started (for example the port is already bound) the service
     *   stops itself and returns [Service.START_NOT_STICKY] so the system does
     *   not keep restarting a service that cannot work.
     */
    override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int {
        startForegroundNotification()
        if (server == null) {
            val candidate = OcrHttpServer(PORT)
            try {
                candidate.start()
            } catch (e: java.io.IOException) {
                // start() fails with IOException when the socket cannot be opened.
                Log.e(TAG, "Failed to start OCR HTTP server on port $PORT", e)
                stopSelf()
                return START_NOT_STICKY
            }
            // Only remember the server once it is actually listening.
            server = candidate
            Log.i(TAG, "OCR HTTP server started on port $PORT")
        }
        return START_STICKY
    }

    /** Stops the HTTP server, if any, before the service goes away. */
    override fun onDestroy() {
        server?.stop()
        server = null
        Log.i(TAG, "OCR HTTP server stopped")
        super.onDestroy()
    }

    /** Binding is not supported. */
    override fun onBind(intent: Intent?): IBinder? = null

    /** Creates the notification channel when required and enters the foreground state. */
    private fun startForegroundNotification() {
        val channelId = "ocr_service"
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
            val channel = NotificationChannel(channelId, "OCR Service", NotificationManager.IMPORTANCE_LOW)
            getSystemService(NotificationManager::class.java).createNotificationChannel(channel)
        }
        val notification = Notification.Builder(this, channelId)
            .setContentTitle("OCR Service")
            .setContentText("Running on port $PORT")
            .setSmallIcon(android.R.drawable.ic_menu_camera)
            .build()
        startForeground(1, notification)
    }

    private companion object {
        const val TAG = "OcrService"
        const val PORT = 18900
    }
}

View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- String resources. The manifest in this commit sets android:label to a literal rather than referencing app_name. -->
<resources>
<string name="app_name">OCR Service</string>
</resources>