You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
158 lines
4.3 KiB
158 lines
4.3 KiB
# BUILT-INS |
|
import os |
|
import re |
|
from PIL import Image |
|
|
|
# VENDOR |
|
import cv2 |
|
import numpy as np |
|
import pytesseract |
|
import pdf2image |
|
|
|
|
|
def get_grayscale(img):
    """Return `img` converted with OpenCV's BGR-to-grayscale colour code."""
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray
|
|
|
|
|
def binarize(img):
    """Return `img` binarized with a Gaussian adaptive threshold.

    The call uses a maximum value of 255, a block size of 11 and a
    constant of 2, with the plain binary threshold type.
    """
    max_value = 255
    block_size = 11
    constant = 2
    return cv2.adaptiveThreshold(
        img,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        constant,
    )
|
|
|
|
|
def remove_noise(img):
    """Return `img` smoothed by a median blur with aperture size 7."""
    aperture = 7
    blurred = cv2.medianBlur(img, aperture)
    return blurred
|
|
|
|
|
def thresholding(img):
    """Return `img` thresholded with the binary + Otsu flags.

    `cv2.threshold` returns a pair; only its second element (the
    thresholded image) is handed back to the caller.
    """
    flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, thresholded = cv2.threshold(img, 0.0, 255.0, flags)
    return thresholded
|
|
|
|
|
def dilate(img):
    """Return `img` dilated once with a 3x3 kernel of ones."""
    structuring_element = np.ones((3, 3), dtype=np.uint8)
    dilated = cv2.dilate(img, structuring_element, iterations=1)
    return dilated
|
|
|
|
|
def erode(img):
    """Return `img` eroded once with a 1x1 kernel of ones."""
    # NOTE(review): a 1x1 kernel presumably leaves the image unchanged;
    # confirm whether a larger kernel (e.g. 3x3, as in `dilate`) was intended.
    structuring_element = np.ones((1, 1), dtype=np.uint8)
    eroded = cv2.erode(img, structuring_element, iterations=1)
    return eroded
|
|
|
|
|
def opening(img):
    """Return `img` after a morphological opening with a 3x3 kernel of ones."""
    structuring_element = np.ones((3, 3), dtype=np.uint8)
    opened = cv2.morphologyEx(img, cv2.MORPH_OPEN, structuring_element)
    return opened
|
|
|
|
|
def canny(img):
    """Return the Canny edge map of `img` using thresholds 100 and 200."""
    low_threshold, high_threshold = 100, 200
    edges = cv2.Canny(img, low_threshold, high_threshold)
    return edges
|
|
|
|
|
def deskew(img):
    """Rotate `img` so that its non-zero content is axis-aligned.

    The skew is estimated from the minimum-area rectangle that encloses
    every pixel whose value is greater than zero; the image is then
    rotated about its centre by the compensating angle.

    Args:
        img: image array; pixels > 0 are treated as content.
            # assumes a single-channel image - TODO confirm with callers

    Returns:
        The rotated image, same width and height as `img`.
    """
    coords = np.column_stack(np.where(img > 0))

    # minAreaRect returns (center, size, angle); the angle is the LAST
    # element (index 1 is the size tuple, which cannot be compared to 45).
    angle = cv2.minAreaRect(coords)[-1]

    # NOTE(review): this normalisation assumes minAreaRect reports angles
    # in [-90, 0); newer OpenCV releases may use a different range -
    # verify against the installed version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)

    # The OpenCV function is `warpAffine`; `wrapAffine` does not exist.
    rotated = cv2.warpAffine(
        img,
        M,
        (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return rotated
|
|
|
|
|
def match_template(img, template):
    """Return the result of matching `template` against `img`.

    Uses OpenCV's `TM_CCOEFF_NORMED` comparison method.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(img, template, method)
    return scores
|
|
|
|
|
def get_rotation(img):
    """Return the rotation reported by Tesseract's OSD for `img`.

    The OSD text is searched for the digits following ``"Rotate: "``.

    Args:
        img: image passed straight to `pytesseract.image_to_osd`.

    Returns:
        The rotation as an int, or None when the OSD output contains no
        ``Rotate:`` entry.
    """
    osd = pytesseract.image_to_osd(img)
    match = re.search(r"(?<=Rotate: )\d+", osd)
    # Hand back the parsed number rather than the raw match object.
    return int(match.group()) if match else None
|
|
|
|
|
def pdf_to_images(file_path):
    """Render every page of a PDF to a PNG file and return the file paths.

    Pages are written to ``<module dir>/../images/<pdf base name>/<i>.png``
    (``i`` starting at 0). Any existing output directory for the same PDF
    is removed first so stale pages never survive a re-run.

    Args:
        file_path: path of the PDF to convert.

    Returns:
        List of the written PNG paths, in page order.
    """
    # Local import keeps this block self-contained; `shutil` is stdlib.
    import shutil

    directory = os.path.relpath(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), "../images")
    )
    subdirectory = os.path.join(
        directory, os.path.basename(os.path.splitext(file_path)[0])
    )

    if os.path.isdir(subdirectory):
        # rmtree also copes with nested directories, unlike remove + rmdir.
        shutil.rmtree(subdirectory)

    # makedirs creates the parent "images" directory when it is missing.
    os.makedirs(subdirectory)

    imgs = pdf2image.convert_from_path(file_path, dpi=300)
    img_paths = []
    for i, img in enumerate(imgs):
        img_path = os.path.join(subdirectory, f"{i}.png")
        img.save(img_path)
        img_paths.append(img_path)

    return img_paths
|
|
|
|
|
class ImageParser(object):
    """Load a PDF as page images, preprocess them and expose their OCR text.

    Attributes:
        file_path: path given to the constructor.
        file_name: base name of `file_path`.
        images: one image per PDF page; replaced by the preprocessed
            versions once `preprocess` has run.
    """

    def __init__(self, file_path):
        """Render `file_path` to images and preprocess them.

        Args:
            file_path: path to an existing PDF file.

        Raises:
            ValueError: `file_path` is empty or not a string.
            FileExistsError: nothing exists at `file_path`.
        """
        # Check the type first so non-string inputs never reach truthiness.
        if not isinstance(file_path, str) or not file_path:
            raise ValueError("file_path arguments is not a valid type")
        if not os.path.isfile(file_path):
            # NOTE(review): FileNotFoundError would describe this better;
            # the exception type is kept so existing callers still work.
            raise FileExistsError("Can't find nothing at the end of the path")

        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.images = [cv2.imread(img_path) for img_path in pdf_to_images(file_path)]
        self.preprocess()

    @property
    def text(self):
        """OCR text of all images, one line per image.

        Each image is read with the "spa" Tesseract language; runs of
        newlines or of spaces inside a page collapse to a single space,
        and every page is prefixed with a newline.
        """
        pages = (
            re.sub(r"(\n+| +)", " ", pytesseract.image_to_string(img, lang="spa"))
            for img in self.images
        )
        return "".join("\n" + page for page in pages)

    def preprocess(self):
        """Replace `self.images` with their cleaned-up versions.

        Pipeline per image: grayscale -> noise removal -> binarization ->
        morphological opening.

        NOTE(review): `__init__` already calls this, and the first step
        uses a BGR-to-gray conversion, so a second call on the resulting
        images presumably fails - confirm before calling it again.
        """
        preprocessed = []
        for img in self.images:
            img = get_grayscale(img)
            img = remove_noise(img)
            img = binarize(img)
            img = opening(img)
            preprocessed.append(img)

        self.images = preprocessed

    def show_image(self, name, img):
        """Show `img` in a window titled `name` until a key is pressed."""
        cv2.imshow(name, img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: OCR one sample PDF and dump its preprocessed pages.
    file_path = os.path.join("../pdfs/T1/Verdi 106.pdf")

    # ImageParser.__init__ already runs preprocess(); calling it again here
    # would push already-preprocessed images through the pipeline twice.
    parser = ImageParser(file_path)
    print(parser.text)

    out_path = os.path.join("../images")
    # Number the files per page; a counter that is never incremented would
    # make every page overwrite "test-1.png".
    for i, img in enumerate(parser.images, start=1):
        cv2.imwrite(os.path.join(out_path, f"test-{i}.png"), img)
|
|
|