init repo
This commit is contained in:
54
grab-captchas.mjs
Normal file
54
grab-captchas.mjs
Normal file
@@ -0,0 +1,54 @@
|
||||
/**
 * Grab 10 captcha images for analysing OCR accuracy.
 */
|
||||
import { execFileSync, execSync } from 'child_process';
import { mkdirSync } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

import { chromium } from 'playwright';
|
||||
|
||||
// Resolve the output directory relative to this script, not the caller's cwd.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DIR = path.join(__dirname, 'captcha-samples');
mkdirSync(DIR, { recursive: true });

// Number of captcha screenshots captured per run.
const SAMPLE_COUNT = 10;
// Characters tesseract is allowed to emit: digits plus ASCII letters.
const TESSERACT_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

/**
 * Run an external OCR command and return its raw stdout.
 *
 * The command is spawned without a shell and the arguments are passed as an
 * array, so special characters in the image path cannot alter the command.
 * Failures (non-zero exit, timeout, missing binary) are reported on stderr and
 * mapped to the sentinel string 'FAIL' so one broken tool does not abort the run.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable verbatim.
 * @param {number} timeout - Maximum run time in milliseconds.
 * @returns {string} The command's stdout, or 'FAIL' if it could not be run.
 */
function runOcr(command, args, timeout) {
  try {
    return execFileSync(command, args, { encoding: 'utf-8', timeout });
  } catch (err) {
    console.warn(`${command} failed: ${err.message}`);
    return 'FAIL';
  }
}

const browser = await chromium.launch({ headless: true, args: ['--no-sandbox'] });

try {
  for (let i = 1; i <= SAMPLE_COUNT; i++) {
    // A new browser context is created for every sample
    // (presumably so each visit is served a fresh captcha — TODO confirm).
    const ctx = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      locale: 'zh-CN',
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    });

    try {
      const page = await ctx.newPage();

      await page.goto('https://www.dianxiaomi.com/home.htm', { waitUntil: 'load', timeout: 30000 });
      await page.waitForSelector('#verifyImgCode', { timeout: 10000 });
      // Best effort: if the image never reports `complete`, the timeout is
      // ignored and the fixed delay below is the only wait.
      await page
        .waitForFunction(() => document.getElementById('verifyImgCode')?.complete === true, { timeout: 5000 })
        .catch(() => {});
      await page.waitForTimeout(1000);

      const el = await page.$('#verifyImgCode');
      if (el === null) {
        throw new Error('Captcha element #verifyImgCode is no longer present on the page');
      }
      const imgPath = path.join(DIR, `captcha_${i}.png`);
      await el.screenshot({ path: imgPath });

      // ddddocr
      // NOTE(review): 'ocr_captcha.py' is resolved against the current working
      // directory, not this script's directory — confirm that is intended.
      const ddddResult = runOcr('python3', ['ocr_captcha.py', imgPath], 30000).trim();

      // tesseract: --psm 7 treats the image as a single line of text.
      // NOTE(review): the original comment labelled this "scheme 0: grayscale +
      // upscale + threshold", but no such preprocessing happens here; the raw
      // screenshot is passed to tesseract as-is.
      const tessResult = runOcr(
        'tesseract',
        [imgPath, 'stdout', '--psm', '7', '-c', `tessedit_char_whitelist=${TESSERACT_WHITELIST}`],
        10000,
      ).replace(/\s/g, '');

      console.log(`#${i}: ddddocr="${ddddResult}" tesseract="${tessResult}" -> ${imgPath}`);
    } finally {
      // Always release the context, even if navigation or the screenshot failed.
      await ctx.close();
    }
  }
} finally {
  // Always shut the browser down so a failed run does not leave it behind.
  await browser.close();
}

console.log('\n>> 完成,请查看 captcha-samples/ 目录');
|
||||
Reference in New Issue
Block a user