# Captcha OCR helpers: a Tesseract wrapper plus a directory-backed file list.
import os
import random

import pytesseract
from PIL import Image

# Options handed to Tesseract on every call: page-segmentation mode 8 (treat
# the image as a single word), OCR engine mode 0 (legacy engine), and a
# character whitelist restricted to lowercase ASCII letters.
config = "--psm 8 --oem 0 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"


def tesOCR(img):
    """Run Tesseract on *img* and return the recognised text.

    The image is passed to ``pytesseract.image_to_string`` with the English
    language data and the module-level ``config`` options.
    """
    recognised = pytesseract.image_to_string(img, lang='eng', config=config)
    return recognised
class Fileset(list):
    """List of file names from one directory that hands out full paths.

    The underlying list stores bare file names.  Integer indexing and
    iteration return each name joined onto ``root``; slicing returns another
    ``Fileset`` sharing the same ``root`` and reader.  Calling the instance
    picks a random entry and, when a reader was supplied, returns the result
    of applying the reader to that path.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the entries of directory *name* whose names end with *ext*.

        Args:
            name: Directory name, joined onto the base directory.  When it is
                not a string nothing is listed and the set starts empty,
                rooted at the base directory itself.
            ext: Suffix an entry must end with; the default ``''`` keeps
                every entry.
            _read: Optional callable applied to the chosen path by
                ``__call__``.
            root: Base directory; falls back to the current working
                directory when omitted or empty.

        Raises:
            OSError: If the directory cannot be listed.
        """
        super().__init__()
        # Both attributes are always defined so that an instance created
        # without a directory name never fails with AttributeError later.
        self._read = _read
        base = root or os.getcwd()
        if isinstance(name, str):
            self.root = os.path.join(base, name)
            self.extend(f for f in os.listdir(self.root) if f.endswith(ext))
        else:
            self.root = base

    def __getitem__(self, index):
        """Return the full path at *index*, or a ``Fileset`` for a slice."""
        # Dispatch on slice rather than int: any other index-like object must
        # take the single-item path, otherwise the one name it selects would
        # be extended into the result character by character.
        if isinstance(index, slice):
            subset = Fileset(None, _read=self._read, root=self.root)
            subset.extend(super().__getitem__(index))
            return subset
        return os.path.join(self.root, super().__getitem__(index))

    def getFileName(self, index):
        """Return the stored name at *index* without its extension."""
        stem, _ = os.path.splitext(super().__getitem__(index))
        return stem

    def __iter__(self):
        """Iterate over the full paths of all entries."""
        return (os.path.join(self.root, f) for f in super().__iter__())

    def __call__(self):
        """Pick a random entry; return it read via ``_read`` when one is set.

        Raises:
            IndexError: If the set is empty.
        """
        chosen = random.choice(self)
        if self._read:
            return self._read(chosen)
        return chosen
# Module-level set of the '.jpg' entries in the 'Captcha' directory under the
# current working directory; calling sample() opens a randomly chosen one with
# PIL. The directory is listed when this statement runs.
sample = Fileset('Captcha', ext='.jpg', _read=Image.open)