TomatoOCR

第三方高精度 OCR 引擎，支持中英日韩多语种识别。

该 OCR 为付费插件，许可证获取及资费详情请访问官网 52tomato.com。

from ascript.ios.plug.TomatoOCR import TomatoOCR
from ascript.ios.system import R

基本配置

LICENSE = "你的license密钥"
MODEL_PATH = R.res("model")  # 模型目录，需包含 det.opt, cls.opt, rec.opt 等文件

模型下载

模型文件下载地址：点击下载（注：原页面此处为下载链接，链接地址在文本提取时丢失，请在原文档页面获取）

下载后将模型文件放入工程的 res/model/ 目录下。

构造参数

ocr = TomatoOCR(
    rec_type="ch-3.0",   # 识别语言: "ch"(中文1.0), "ch-2.0", "ch-3.0"(推荐), "cht"(繁体), "japan", "korean"
    box_type="rect",     # 检测框类型: "rect"(矩形,手机推荐), "quad"(四边形,倾斜文本)
    ratio=1.9,           # 检测扩展比例: 1.6-2.5, 值越大检测框越大
    threshold=0.3,       # 识别置信度阈值: 0.1-0.9, 低于此值的结果被过滤
    return_type="json",  # 返回类型: "json"(完整信息), "text"(纯文字), "num"(纯数字)
    binary=0,            # 二值化阈值: 0-255, 0=不启用
    run_mode="fast",     # 运行模式: "slow"(高精度), "normal", "fast"(高速度)
)
ocr.setModelPath(MODEL_PATH)
ocr.setLicense(LICENSE)

参数	类型	默认值	说明
rec_type	str	"ch-3.0"	识别语言：ch / ch-2.0 / ch-3.0(推荐) / cht(繁体) / japan / korean
box_type	str	"rect"	检测框：rect(矩形) / quad(四边形，倾斜文本)
ratio	float	1.9	检测扩展比例，1.6-2.5
threshold	float	0.3	置信度阈值，0.1-0.9
return_type	str	"json"	返回类型：json / text / num
binary	int	0	二值化阈值，0=不启用，0-255
scale_ratio	float	1.0	检测缩放比例
run_mode	str	"slow"	运行模式：slow(高精度) / normal / fast(高速度)

返回数据结构

每条识别结果：

{
    "result": (center_x, center_y),   # 文字中心点坐标
    "rect": (x1, y1, x2, y2),        # 文字矩形区域
    "center_x": 155.5,               # 中心点 X
    "center_y": 44.0,                # 中心点 Y
    "confidence": 0.95,              # 置信度 (0-1)
    "text": "识别到的文字"             # 识别文本
}

分步调用

全屏截图识别

ocr = TomatoOCR(rec_type="ch-3.0", run_mode="fast")
ocr.setModelPath(MODEL_PATH)
ocr.setLicense(LICENSE)

data = ocr.ocrCapture()
for item in data:
    print(f"文字: {item['text']}  坐标: {item['result']}  置信度: {item['confidence']:.2f}")

指定区域识别

# rect 参数: [x1, y1, x2, y2]
data = ocr.ocrCapture(rect=[0, 0, 400, 200])
for item in data:
    print(f"文字: {item['text']}  区域: {item['rect']}")

识别图片文件

from ascript.ios import screen

screen.capture().save(R.res("test.png"))
data = ocr.ocrFile(R.res("test.png"))
for item in data:
    print(f"文字: {item['text']}")

识别 Bitmap 图片对象

from ascript.ios import screen

bitmap = screen.capture()  # 返回 PIL Image
data = ocr.ocrBitmap(bitmap)
for item in data:
    print(f"文字: {item['text']}")

查找文字位置

# 先全屏识别一次（findTapPoint 基于最近一次识别结果）
ocr.ocrCapture()

# 查找单个文字位置
point = ocr.findTapPoint("设置")
# 返回: {"x": 100, "y": 200} 或 None

# 查找多个匹配位置
points = ocr.findTapPoints("确定")

find_all 一步调用（推荐）

一步完成初始化 + 识别，适合快速调用。

TomatoOCR.find_all(
    license: str,          # (必填) 许可证密钥
    model_path: str,       # (必填) 模型目录路径
    mode="dev",            # 运行环境: "dev"(开发) / "prod"(生产)
    rec_type="ch-3.0",    # 识别语言
    box_type="rect",       # 检测框类型
    ratio=1.9,             # 检测扩展比例
    scale_ratio=1.0,       # 检测缩放比例
    threshold=0.3,         # 置信度阈值
    return_type="json",    # 返回类型
    binary=0,              # 二值化阈值
    run_mode="slow",       # 运行模式: "slow"(高精度) / "normal" / "fast"(高速度)
    ocr_type=3,            # 0:只检测 1:方向分类+识别 2:只识别 3:检测+识别(默认)
    bitmap=None,           # PIL Image 对象
    file=None,             # 图片文件路径
    capture=[],            # 截图区域 [x1, y1, x2, y2]
    text="",               # 查找单个文字，返回中心点坐标
    texts="",              # 查找多个匹配，返回所有位置
    remark="",             # 备注信息
    http_interval_time=28800,  # HTTP 校验间隔(秒)，默认8小时
)

bitmap / file / capture 均为可选参数，最多传入其中一个：

capture=[0,0,300,100] — 截取屏幕指定区域识别
file=R.res("test.png") — 识别图片文件
bitmap=image — 识别 PIL Image 对象
都不传 — 全屏截图识别

全屏识别

data = TomatoOCR.find_all(
    license=LICENSE,
    model_path=MODEL_PATH,
    rec_type="ch-3.0",
    run_mode="fast",
)
for item in data:
    print(item['text'])

指定区域识别

data = TomatoOCR.find_all(
    license=LICENSE,
    model_path=MODEL_PATH,
    capture=[100, 200, 500, 800],
)

识别图片文件

from ascript.ios import screen
screen.capture().save(R.res("test.png"))

data = TomatoOCR.find_all(
    license=LICENSE,
    model_path=MODEL_PATH,
    file=R.res("test.png"),
)

查找文字并定位

# 查找单个文字，返回中心点坐标 {"x": ..., "y": ...} 或 None
point = TomatoOCR.find_all(
    license=LICENSE,
    model_path=MODEL_PATH,
    text="设置",
)
print("位置:", point)

# 查找所有匹配
points = TomatoOCR.find_all(
    license=LICENSE,
    model_path=MODEL_PATH,
    texts="确定",
)
print("所有匹配:", points)

实战：找到文字并点击

from ascript.ios import action
from ascript.ios.plug.TomatoOCR import TomatoOCR
from ascript.ios.system import R

LICENSE = "你的license"
MODEL_PATH = R.res("model")

point = TomatoOCR.find_all(
    license=LICENSE,
    model_path=MODEL_PATH,
    text="同意",
)
if point:
    action.tap(int(point['x']), int(point['y']))
    print("已点击: 同意")
else:
    print("未找到: 同意")

TomatoOCR

基本配置

模型下载

构造参数

返回数据结构

分步调用

全屏截图识别

指定区域识别

识别图片文件

识别 Bitmap 图片对象

查找文字位置

find_all 一步调用（推荐）

全屏识别

指定区域识别

识别图片文件

查找文字并定位

实战：找到文字并点击

基本配置

模型下载

构造参数

返回数据结构

分步调用

全屏截图识别

指定区域识别

识别图片文件

识别 Bitmap 图片对象

查找文字位置

find_all 一步调用（推荐）

全屏识别

指定区域识别

识别图片文件

查找文字并定位

实战：找到文字并点击