---
title: pytesseract测试
tags:
  - Captcha
  - OCR
  - Python
  - tesseract
id: '297'
categories:
  - - Python练习
date: 2020-07-02 19:37:35
---

```python
from PIL import Image
#from itertools import cycle
import os, random
import pytesseract

# Tesseract options: --psm 8 treats the image as a single word, --oem 0
# selects the legacy engine, and the whitelist limits output to a-z.
config = "--psm 8 --oem 0 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"


def tesOCR(img):
    """Run Tesseract on ``img`` and return the recognised text."""
    return pytesseract.image_to_string(img, lang='eng', config=config)


class Fileset(list):
    """List of the file names found in one directory.

    The list stores bare file names; integer indexing and iteration
    return them joined with ``self.root``.  Calling the instance picks a
    random file and, when a ``_read`` callable was supplied, returns
    ``_read(path)`` instead of the path.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the files in ``root/name`` whose names end with ``ext``.

        ``root`` falls back to the current working directory.  When
        ``name`` is not a string the set starts empty with ``root`` set to
        ``None`` (this is how slices are built).
        """
        super().__init__()
        # Always define the attribute so an empty set never raises
        # AttributeError when ``root`` is read.
        self.root = None
        if isinstance(name, str):
            self.root = os.path.join(root or os.getcwd(), name)
            self.extend(f for f in os.listdir(self.root) if f.endswith(ext))
        self._read = _read

    def __getitem__(self, index):
        """Return a full path for an integer index, a new Fileset for a slice."""
        if isinstance(index, int):
            # index is an integer position
            return os.path.join(self.root, super().__getitem__(index))
        # index is a slice: copy the settings, keep only the selected names
        fileset = Fileset(None)
        fileset.root = self.root
        fileset._read = self._read
        fileset.extend(super().__getitem__(index))
        return fileset

    def getFileName(self, index):
        """Return the stored name at ``index`` without its extension."""
        fname, _ext = os.path.splitext(super().__getitem__(index))
        return fname

    def __iter__(self):
        """Iterate over the full paths of the stored files."""
        return (os.path.join(self.root, f) for f in super().__iter__())

    def __call__(self):
        """Pick a random file; return ``_read(path)`` if set, else the path."""
        retn = random.choice(self)
        if self._read:
            return self._read(retn)
        return retn


sample = Fileset('Captcha', '.jpg', Image.open)
```

[![]()](https://limour.lanzous.com/iqHJfdxripg) 测试用验证码文件

![](https://img-cdn.limour.top/blog_wp/2020/07/微信图片_20200702203134.png)

与百度OCR的对比,第一个是tesseractOCR

```python
#tesOCR.py
from PIL import Image
import pytesseract
from io import BytesIO

config = "--psm 8 --oem 0 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"
```

```python
#baiduOCR.py
from aip import AipOcr

""" 你的 APPID AK SK """
APP_ID = ''
API_KEY = ''
SECRET_KEY = ''

client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

options = {}
options["language_type"] = "ENG"
options["detect_direction"] = "true"
options["detect_language"] = "true"
```

```python
#test.py
import os, random


class Fileset(list):
    """List of the file names found in one directory.

    The list stores bare file names; integer indexing and iteration
    return them joined with ``self.root``.  Calling the instance picks a
    random file and, when a ``_read`` callable was supplied, returns
    ``_read(path)`` instead of the path.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the files in ``root/name`` whose names end with ``ext``.

        ``root`` falls back to the current working directory.  When
        ``name`` is not a string the set starts empty with ``root`` set to
        ``None`` (this is how slices are built).
        """
        super().__init__()
        # Always define the attribute so an empty set never raises
        # AttributeError when ``root`` is read.
        self.root = None
        if isinstance(name, str):
            self.root = os.path.join(root or os.getcwd(), name)
            self.extend(f for f in os.listdir(self.root) if f.endswith(ext))
        self._read = _read

    def __getitem__(self, index):
        """Return a full path for an integer index, a new Fileset for a slice."""
        if isinstance(index, int):
            # index is an integer position
            return os.path.join(self.root, super().__getitem__(index))
        # index is a slice: copy the settings, keep only the selected names
        fileset = Fileset(None)
        fileset.root = self.root
        fileset._read = self._read
        fileset.extend(super().__getitem__(index))
        return fileset

    def getFileName(self, index):
        """Return the stored name at ``index`` without its extension."""
        fname, _ext = os.path.splitext(super().__getitem__(index))
        return fname

    def __iter__(self):
        """Iterate over the full paths of the stored files."""
        return (os.path.join(self.root, f) for f in super().__iter__())

    def __call__(self):
        """Pick a random file; return ``_read(path)`` if set, else the path."""
        retn = random.choice(self)
        if self._read:
            return self._read(retn)
        return retn


def fopen(path):
    """Return the raw bytes of the file at ``path``."""
    with open(path, 'rb') as f:
        return f.read()


sample = Fileset('Captcha', '.jpg', fopen)

# Empty input selects tesseract, "1" selects Baidu OCR; any other answer
# leaves OCR bound to the string that was typed.
OCR = input('请选择验证码识别方式(默认为tesseract, 1为百度OCR):')
if not OCR:
    from tesOCR import tesOCR as OCR
elif OCR == "1" :
    from baiduOCR import BaiduOCR as OCR
    # NOTE(review): the nesting of this second import was lost when the
    # post's line breaks were collapsed; it may belong at top level - confirm.
    from baiduOCR import BaiduOCR
```