You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
4.3 KiB
159 lines
4.3 KiB
![]()
1 year ago
|
# BUILT-INS
|
||
|
import os
|
||
|
import re
|
||
|
from PIL import Image
|
||
|
|
||
|
# VENDOR
|
||
|
import cv2
|
||
|
import numpy as np
|
||
|
import pytesseract
|
||
|
import pdf2image
|
||
|
|
||
|
|
||
|
def get_grayscale(img):
    """Convert a BGR image to grayscale.

    Args:
        img: Image array handed directly to ``cv2.cvtColor``; the
            ``COLOR_BGR2GRAY`` conversion code implies BGR channel order.

    Returns:
        The result of ``cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)``.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray
|
||
|
|
||
|
|
||
|
def binarize(img):
    """Binarize an image with Gaussian adaptive thresholding.

    Args:
        img: Image array passed to ``cv2.adaptiveThreshold``.
            # presumably a single-channel 8-bit image -- TODO confirm

    Returns:
        The thresholded image produced by ``cv2.adaptiveThreshold``.
    """
    max_value = 255   # value written to pixels that pass the threshold
    block_size = 11   # neighbourhood size used to compute the local threshold
    offset = 2        # constant subtracted from the weighted local mean
    return cv2.adaptiveThreshold(
        img,
        max_value,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
|
||
|
|
||
|
|
||
|
def remove_noise(img):
    """Smooth an image with a median filter.

    Args:
        img: Image array passed to ``cv2.medianBlur``.

    Returns:
        The image blurred with an aperture size of 7.
    """
    aperture = 7
    blurred = cv2.medianBlur(img, aperture)
    return blurred
|
||
|
|
||
|
|
||
|
def thresholding(img):
    """Binarize an image using a global threshold with the Otsu flag set.

    Args:
        img: Image array passed to ``cv2.threshold``.

    Returns:
        The thresholded image, i.e. the second element of the tuple
        returned by ``cv2.threshold``; the first element is discarded.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    outcome = cv2.threshold(img, 0.0, 255.0, mode)
    return outcome[1]
|
||
|
|
||
|
|
||
|
def dilate(img):
    """Dilate an image once with a 3x3 kernel of ones.

    Args:
        img: Image array passed to ``cv2.dilate``.

    Returns:
        The dilated image.
    """
    structuring_element = np.ones((3, 3), dtype=np.uint8)
    return cv2.dilate(img, structuring_element, iterations=1)
|
||
|
|
||
|
|
||
|
def erode(img):
    """Erode an image once with a 1x1 kernel of ones.

    Args:
        img: Image array passed to ``cv2.erode``.

    Returns:
        The eroded image.
    """
    # NOTE(review): a 1x1 kernel looks like it makes the erosion a no-op;
    # confirm whether a larger kernel (e.g. 3x3, as in dilate) was intended.
    structuring_element = np.ones((1, 1), dtype=np.uint8)
    return cv2.erode(img, structuring_element, iterations=1)
|
||
|
|
||
|
|
||
|
def opening(img):
    """Apply a morphological opening with a 3x3 kernel of ones.

    Args:
        img: Image array passed to ``cv2.morphologyEx``.

    Returns:
        The image after the ``cv2.MORPH_OPEN`` operation.
    """
    structuring_element = np.ones((3, 3), dtype=np.uint8)
    opened = cv2.morphologyEx(img, cv2.MORPH_OPEN, structuring_element)
    return opened
|
||
|
|
||
|
|
||
|
def canny(img):
    """Run Canny edge detection on an image.

    Args:
        img: Image array passed to ``cv2.Canny``.

    Returns:
        The edge map computed with thresholds 100 and 200.
    """
    low_threshold, high_threshold = 100, 200
    return cv2.Canny(img, low_threshold, high_threshold)
|
||
|
|
||
|
|
||
|
def deskew(img):
    """Rotate an image so that its non-zero content is axis-aligned.

    The skew is estimated from the minimum-area rectangle that encloses
    every pixel whose value is greater than zero; the image is then rotated
    around its centre to compensate.

    Args:
        img: Image array; pixels greater than zero are treated as content.
            # presumably a single-channel (2-D) image -- TODO confirm

    Returns:
        The rotated image, same width and height as the input. Areas
        uncovered by the rotation replicate the border pixels. When the
        image has no pixel above zero it is returned unchanged.
    """
    # cv2.minAreaRect only accepts int32/float32 points, while np.where
    # yields platform-sized integers, hence the explicit cast.
    coords = np.column_stack(np.where(img > 0)).astype(np.float32)
    if coords.size == 0:
        # No content to measure a skew from.
        return img

    # minAreaRect returns ((cx, cy), (w, h), angle): the angle is the LAST
    # element, not the second one.
    angle = cv2.minAreaRect(coords)[-1]

    # A rectangle's orientation is only defined modulo 90 degrees, and
    # OpenCV releases disagree on the reported range ([-90, 0) vs (0, 90]).
    # Folding into [-45, 45] gives the smallest correcting rotation under
    # either convention.
    if angle < -45:
        angle += 90
    elif angle > 45:
        angle -= 90
    angle = -angle

    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        img,
        M,
        (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return rotated
|
||
|
|
||
|
|
||
|
def match_template(img, template):
    """Match a template against an image.

    Args:
        img: Image array to search in.
        template: Image array to search for.

    Returns:
        The result map of ``cv2.matchTemplate`` computed with the
        ``cv2.TM_CCOEFF_NORMED`` method.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(img, template, method)
    return scores
|
||
|
|
||
|
|
||
|
def get_rotation(img):
    """Search Tesseract's OSD report of an image for its "Rotate" value.

    Args:
        img: Image passed to ``pytesseract.image_to_osd``.

    Returns:
        The ``re.Match`` object covering the digits that follow
        ``"Rotate: "`` in the report, or ``None`` when the pattern is not
        found. Note that this is the match object itself, not a number:
        callers need ``.group()`` to obtain the digits.
    """
    report = pytesseract.image_to_osd(img)
    return re.search(r"(?<=Rotate: )\d+", report)
|
||
|
|
||
|
|
||
|
def pdf_to_images(file_path):
    """Render every page of a PDF to a PNG file and return the file paths.

    Pages are written to ``<module dir>/../images/<pdf base name>/`` as
    ``0.png``, ``1.png``, ... Any previous content of that sub-directory is
    discarded first.

    Args:
        file_path: Path of the PDF, passed to
            ``pdf2image.convert_from_path``.

    Returns:
        List of the written PNG paths, in page order.
    """
    # Local import so this block does not depend on the module's import list.
    import shutil

    module_dir = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(module_dir, "../images"))
    pdf_stem = os.path.basename(os.path.splitext(file_path)[0])
    subdirectory = os.path.join(directory, pdf_stem)

    # Convert before touching the disk so a failed conversion does not wipe
    # the output of a previous successful run.
    pages = pdf2image.convert_from_path(file_path, dpi=300)

    # rmtree also copes with nested directories, which the former
    # os.remove loop could not delete.
    if os.path.isdir(subdirectory):
        shutil.rmtree(subdirectory)
    # makedirs creates the parent "images" directory too when it is missing;
    # plain os.mkdir raised FileNotFoundError in that case.
    os.makedirs(subdirectory)

    img_paths = []
    for index, page in enumerate(pages):
        img_path = os.path.join(subdirectory, f"{index}.png")
        page.save(img_path)
        img_paths.append(img_path)

    return img_paths
|
||
|
|
||
|
|
||
|
class ImageParser(object):
    """Render a PDF to page images, preprocess them and expose their text."""

    def __init__(self, file_path):
        """Load and preprocess the pages of the PDF at *file_path*.

        Args:
            file_path: Path of the PDF to parse; must be a non-empty string
                pointing at an existing file.

        Raises:
            ValueError: If *file_path* is empty or not a string.
            FileExistsError: If *file_path* is not an existing file.
        """
        if not file_path or not isinstance(file_path, str):
            raise ValueError("file_path arguments is not a valid type")
        if not os.path.isfile(file_path):
            # NOTE(review): FileNotFoundError would describe this condition
            # better; FileExistsError is kept so existing callers that
            # catch it keep working.
            raise FileExistsError("Can't find nothing at the end of the path")

        # Path of the source PDF, as given by the caller.
        self.file_path = file_path
        # Base name of the source PDF.
        self.file_name = os.path.basename(file_path)
        # One image per page, read back from the PNG files that
        # pdf_to_images writes; replaced by preprocess() below.
        self.images = [cv2.imread(img_path) for img_path in pdf_to_images(file_path)]
        self.preprocess()

    @property
    def text(self):
        """Text recognised on all pages.

        Each page is run through ``pytesseract.image_to_string`` with
        ``lang="spa"``; every run of newlines or of spaces is collapsed
        into a single space, and each page's text is prefixed with a
        newline. Returns an empty string when there are no images.
        """
        pages = []
        for img in self.images:
            raw = pytesseract.image_to_string(img, lang="spa")
            pages.append("\n" + re.sub(r"(\n+| +)", " ", raw))
        # join instead of repeated "+=" to avoid quadratic concatenation.
        return "".join(pages)

    def preprocess(self):
        """Replace ``self.images`` with their preprocessed versions.

        Each image goes through grayscale conversion, median-blur noise
        removal, adaptive binarization and a morphological opening, in that
        order. Images that are already single-channel (2-D) are kept as
        they are, so calling this method again after ``__init__`` is a
        no-op rather than a ``cv2.cvtColor`` failure on one-channel input.
        """
        preprocessed = []
        for img in self.images:
            if getattr(img, "ndim", None) == 2:
                # Already reduced to one channel by a previous pass.
                preprocessed.append(img)
                continue

            img = get_grayscale(img)
            img = remove_noise(img)
            img = binarize(img)
            img = opening(img)
            preprocessed.append(img)

        self.images = preprocessed

    def show_image(self, name, img):
        """Show *img* in a window titled *name* until a key is pressed.

        Blocks on ``cv2.waitKey(0)`` and then closes every OpenCV window.
        """
        cv2.imshow(name, img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
file_path = os.path.join("../pdfs/T1/Verdi 106.pdf")
|
||
|
parser = ImageParser(file_path)
|
||
|
parser.preprocess()
|
||
|
print(parser.text)
|
||
|
i = 1
|
||
|
out_path = os.path.join("../images")
|
||
|
for img in parser.images:
|
||
|
cv2.imwrite(os.path.join(out_path, "test-%s.png" % i), img)
|