pytesseract测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from PIL import Image
#from itertools import cycle
import os, random
import pytesseract
# Tesseract options: page-segmentation mode 8 (treat the image as a single
# word), legacy engine only (--oem 0), and a character whitelist that limits
# the output to lowercase ASCII letters.
config = "--psm 8 --oem 0 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"


def tesOCR(img):
    """Recognize the text in *img* with Tesseract.

    Args:
        img: Image handed straight to ``pytesseract.image_to_string``.

    Returns:
        The value returned by ``pytesseract.image_to_string`` when run with
        the English language data and the module-level ``config`` options.
    """
    return pytesseract.image_to_string(img, lang='eng', config=config)


class Fileset(list):
    """List of file names inside one directory, exposed as full paths.

    The list itself stores bare file names; indexing and iteration join them
    with ``root`` on the way out.  Calling the instance picks a random file
    and, when a reader was supplied, returns the reader's result for it.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the files of directory *name* whose names end with *ext*.

        Args:
            name: Directory name relative to *root*.  Any non-string value
                (e.g. ``None``) creates an empty set without listing a
                directory.
            ext: Required file-name suffix; the empty default matches all.
            _read: Optional callable applied to the chosen path by
                ``__call__``.
            root: Base directory; defaults to the current working directory.
        """
        super().__init__()
        base = root or os.getcwd()
        if isinstance(name, str):
            self.root = os.path.join(base, name)
            # os.listdir returns names in arbitrary order; sort them so that
            # positional indexes are reproducible between runs.
            names = sorted(f for f in os.listdir(self.root) if f.endswith(ext))
            self.extend(names)
        else:
            # Always define root so an empty set never raises AttributeError.
            self.root = base
        self._read = _read

    def __getitem__(self, index):
        """Return the full path at *index*, or a ``Fileset`` for a slice."""
        if isinstance(index, slice):
            subset = Fileset(None)
            subset.root = self.root
            subset._read = self._read
            subset.extend(super().__getitem__(index))
            return subset
        # Everything that is not a slice is a positional index; this also
        # covers objects that only implement __index__ instead of being int.
        return os.path.join(self.root, super().__getitem__(index))

    def getFileName(self, index):
        """Return the bare file name at *index* without its extension."""
        stem, _suffix = os.path.splitext(super().__getitem__(index))
        return stem

    def __iter__(self):
        """Iterate over the full paths of all files in the set."""
        return (os.path.join(self.root, f) for f in super().__iter__())

    def __call__(self):
        """Pick a random file; return ``_read(path)`` or the path itself."""
        chosen = random.choice(self)
        if self._read:
            return self._read(chosen)
        return chosen

# Captcha samples: the '.jpg' files in ./Captcha (relative to the working
# directory); calling sample() opens a randomly chosen one with PIL.
sample = Fileset('Captcha', '.jpg', Image.open)

测试用验证码文件

与百度 OCR 的对比,第一个是 tesseract OCR。

1
2
3
4
5
6
7
#tesOCR.py
from PIL import Image
import pytesseract
from io import BytesIO

# Tesseract options: page-segmentation mode 8 (treat the image as a single
# word), legacy engine only (--oem 0), and a character whitelist that limits
# the output to lowercase ASCII letters.
config = "--psm 8 --oem 0 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"

1
2
3
4
5
6
7
8
9
10
11
12
13
#baiduOCR.py
from aip import AipOcr
""" 你的 APPID AK SK """
# Placeholders for your own Baidu OCR credentials (APP ID, API key, secret
# key); they are empty here and must be filled in before use.
APP_ID = ''
API_KEY = ''
SECRET_KEY = ''

# Single client instance built from the credentials above.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

# Request options; presumably passed along with each recognition call --
# the call site is not shown in this snippet.
options = {
    "language_type": "ENG",
    "detect_direction": "true",
    "detect_language": "true",
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#test.py
import os, random
class Fileset(list):
    """List of file names inside one directory, exposed as full paths.

    The list itself stores bare file names; indexing and iteration join them
    with ``root`` on the way out.  Calling the instance picks a random file
    and, when a reader was supplied, returns the reader's result for it.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the files of directory *name* whose names end with *ext*.

        Args:
            name: Directory name relative to *root*.  Any non-string value
                (e.g. ``None``) creates an empty set without listing a
                directory.
            ext: Required file-name suffix; the empty default matches all.
            _read: Optional callable applied to the chosen path by
                ``__call__``.
            root: Base directory; defaults to the current working directory.
        """
        super().__init__()
        base = root or os.getcwd()
        if isinstance(name, str):
            self.root = os.path.join(base, name)
            # os.listdir returns names in arbitrary order; sort them so that
            # positional indexes are reproducible between runs.
            names = sorted(f for f in os.listdir(self.root) if f.endswith(ext))
            self.extend(names)
        else:
            # Always define root so an empty set never raises AttributeError.
            self.root = base
        self._read = _read

    def __getitem__(self, index):
        """Return the full path at *index*, or a ``Fileset`` for a slice."""
        if isinstance(index, slice):
            subset = Fileset(None)
            subset.root = self.root
            subset._read = self._read
            subset.extend(super().__getitem__(index))
            return subset
        # Everything that is not a slice is a positional index; this also
        # covers objects that only implement __index__ instead of being int.
        return os.path.join(self.root, super().__getitem__(index))

    def getFileName(self, index):
        """Return the bare file name at *index* without its extension."""
        stem, _suffix = os.path.splitext(super().__getitem__(index))
        return stem

    def __iter__(self):
        """Iterate over the full paths of all files in the set."""
        return (os.path.join(self.root, f) for f in super().__iter__())

    def __call__(self):
        """Pick a random file; return ``_read(path)`` or the path itself."""
        chosen = random.choice(self)
        if self._read:
            return self._read(chosen)
        return chosen
def fopen(path):
    """Return the entire content of the file at *path* as ``bytes``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(path, 'rb') as f:
        return f.read()
# Captcha samples: the '.jpg' files in ./Captcha (relative to the working
# directory); calling sample() returns the raw bytes of a randomly chosen one.
sample = Fileset('Captcha', '.jpg', fopen)

# Ask which OCR backend to use.  The prompt reads: "Choose the captcha
# recognition method (default is tesseract, 1 for Baidu OCR):".
OCR = input('请选择验证码识别方式(默认为tesseract, 1为百度OCR):').strip()
if OCR == "1":
    from baiduOCR import BaiduOCR as OCR
else:
    # Empty or unrecognized input falls back to the advertised default, so
    # OCR is never left bound to the raw input string.
    from tesOCR import tesOCR as OCR
# BaiduOCR is additionally imported under its own name whatever was chosen.
from baiduOCR import BaiduOCR


pytesseract测试
https://b.limour.top/297.html
Author
Limour
Posted on
July 2, 2020
Licensed under