
commit
ec5e968eb0
14 changed files with 1120 additions and 0 deletions
@ -0,0 +1,33 @@
|
||||
# BUILT-INS |
||||
import json |
||||
|
||||
# SOURCE |
||||
from parsers import PdfParser |
||||
from parsers.geocoding import GeoLocation |
||||
|
||||
|
||||
if __name__ == "__main__":
    # Batch driver: parse every PDF under ./pdfs/T1, geolocate it and append a
    # human-readable report for each file to log.txt.
    import os

    directory = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), "pdfs/T1"))
    # The report contains non-ASCII text ("imàtge", JSON dumped with
    # ensure_ascii=False), so the encoding is pinned instead of relying on the
    # platform default.
    with open("log.txt", "wt", encoding="utf-8") as conn:
        for file_name in os.listdir(directory):
            file_path = os.path.join(directory, file_name)
            parser = PdfParser(file_path)
            # GeoLocation attaches itself to the parser as `parser.geolocation`.
            GeoLocation(parser)
            pdf_format = parser.format
            data = parser.data
            format_label = "Format imàtge" if pdf_format == "img" else "Format vectorial"
            status_label = "" if data.success else "(UNABLE TO READ)"
            conn.write(f"""
# FILENAME
{file_name.upper()}
{format_label} {status_label}

## PLAIN_TEXT
{data}
## STRUCTURED
description: {json.dumps(data.description.data, indent=4, ensure_ascii=False)}
ownership: {json.dumps(data.ownership.data, indent=4, ensure_ascii=False)}

## GEOLOCATION
{json.dumps(parser.geolocation.data, indent=4, ensure_ascii=False)}
""")
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,200 @@
|
||||
import geocoder |
||||
import re |
||||
import json |
||||
|
||||
|
||||
# Spanish / Catalan number words, built once at import time.
_NUMBER_UNITS = {
    "uno": 1, "un": 1,
    "dos": 2,
    "tres": 3,
    "cuatro": 4, "quatre": 4,
    "cinco": 5, "cinc": 5,
    "seis": 6, "sis": 6,
    "siete": 7, "set": 7,
    "ocho": 8, "vuit": 8,
    "nueve": 9, "nou": 9,
}

_NUMBER_TEENS = {
    "once": 11, "onze": 11,
    "doce": 12, "dotze": 12,
    "trece": 13, "tretze": 13,
    "catorce": 14, "catorze": 14,
    "quince": 15, "quinze": 15,
    "dieciseis": 16, "setze": 16,
    "diecisiete": 17, "diset": 17, "disset": 17,
    "dieciocho": 18, "divuit": 18,
    "diecinueve": 19, "dinou": 19,
}

_NUMBER_TENS = {
    "diez": 10, "deu": 10,
    "veinte": 20, "veint": 20, "vint": 20,
    "treinta": 30, "trenta": 30,
    "cuarenta": 40, "quaranta": 40,
    "cincuenta": 50, "cinquanta": 50,
    "sesenta": 60, "seixanta": 60,
    "setenta": 70,
    "ochenta": 80, "vuitanta": 80,
    "noventa": 90, "noranta": 90,
}

_NUMBER_HUNDREDS = ("cien", "cent")
_NUMBER_THOUSANDS = ("mil",)


def parse_number(chars):
    """Convert a street number given as digits or as Spanish/Catalan words to an int.

    Args:
        chars: The raw number text (e.g. ``"106"``, ``"ciento seis"``,
            ``"vint-i-dos"``) or a falsy value.

    Returns:
        ``""`` when ``chars`` is falsy; otherwise an ``int``. Text in which no
        number word is recognised yields ``0``.

    The text is scanned once per word family (units, teens, tens, hundreds,
    thousands) and the matched values are summed. Known limitations: a
    multiplier separated by a space ("dos mil") and irregular hundreds
    ("quinientos", "setecientos") are not combined correctly.
    """
    if not chars:
        return ""

    chars = chars.lower()
    try:
        return int(chars)
    except ValueError:
        pass

    # Drop diacritics so that "dieciséis" / "veintidós" match the ASCII tables
    # and the [a-z] look-arounds below treat them as ordinary letters.
    import unicodedata
    decomposed = unicodedata.normalize("NFD", chars)
    chars = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    number = 0

    for word, value in _NUMBER_UNITS.items():
        if re.search(r"(?<![a-z])(?:y|i|-)? *%s(?![a-z])" % word, chars):
            number += value

    for word, value in _NUMBER_TEENS.items():
        # Negative look-behind: the word must START here. (A positive
        # look-behind would require a preceding letter and never match a
        # standalone "once".)
        if re.search(r"(?<![a-z])%s(?![a-z])" % word, chars):
            number += value

    for word, value in _NUMBER_TENS.items():
        # Either the bare tens word, or a fused compound such as "veintidos"
        # whose trailing unit is captured. Requiring a word end afterwards
        # stops "veint" from also matching inside "veinte".
        match = re.search(r"(?<![a-z])%s(?:i([a-z]+))?(?![a-z])" % word, chars)
        if match:
            number += value + _NUMBER_UNITS.get(match.group(1), 0)

    for word in _NUMBER_HUNDREDS:
        match = re.search(r"(?<![a-z])([a-z]+)?%s(?:tos?|s?)?(?![a-z])" % word, chars)
        if match:
            # Parenthesised on purpose: `a or 1 * 100` would parse as
            # `a or (1 * 100)` and turn "doscientos" into 2.
            number += (_NUMBER_UNITS.get(match.group(1)) or 1) * 100

    for word in _NUMBER_THOUSANDS:
        match = re.search(r"(?<![a-z])([a-z]+)?%s(?![a-z])" % word, chars)
        if match:
            number += (_NUMBER_UNITS.get(match.group(1)) or 1) * 1000

    return number
||||
|
||||
|
||||
def parse_street(chars):
    """Lower-case a street name and separate words that were glued together.

    Each space-separated word is split around every ``[A-Z][a-z]+`` run (so
    ``"VerdiMayor"`` becomes ``"verdi mayor"``); the non-empty pieces are
    lower-cased and joined with single spaces. A falsy input yields ``""``.
    """
    if not chars:
        return ""

    pieces = []
    for word in chars.split(" "):
        for chunk in re.split(r"([A-Z][a-z]+)", word):
            if chunk:
                pieces.append(chunk.lower())
    return " ".join(pieces)
||||
|
||||
|
||||
def parse_town(chars):
    """Normalise a town name for geocoding.

    The name is lower-cased and glued words are separated exactly as in
    ``parse_street``; a leading "villa"/"vila" (optionally followed by "de")
    is then removed.

    The prefix is only removed when it is a whole word. Without the word
    boundaries, names that merely contain it ("Vilanova", "Sevilla",
    "Viladecans") were truncated.

    Args:
        chars: Raw town text, or a falsy value.

    Returns:
        The normalised town, or ``""`` for falsy input.
    """
    if not chars:
        return ""

    town = " ".join(
        chunk.lower()
        for word in chars.split(" ")
        for chunk in re.split(r"([A-Z][a-z]+)", word)
        if chunk
    )
    return re.sub(r"\b(?:villa|vila)\b(?: +de\b)? *", "", town)
||||
|
||||
|
||||
def build_address(record):
    """Compose the free-text address that is sent to the geocoder.

    Reads ``number``, ``street`` and ``town`` from ``record.description.data``,
    normalises each with the matching ``parse_*`` helper and appends the fixed
    region/country suffix. Empty comma-separated segments and a leading comma
    are removed afterwards.
    """
    fields = record.description.data
    number = parse_number(fields.get("number"))
    street = parse_street(fields.get("street"))
    town = parse_town(fields.get("town"))

    raw = f"{number} {street}, {town}, cataluña, españa"
    # First drop empty segments between commas, then a comma left at the start.
    without_empty_segments = re.sub(r"(?<=,) *,", "", raw)
    return re.sub(r"^ *, *", "", without_empty_segments)
||||
|
||||
|
||||
class GeoLocation(object):
    """Geocode a parsed record and expose the result through read-only properties.

    On construction the address built from ``parser.data`` is looked up with
    ``geocoder.osm`` and the instance is attached to the parser as
    ``parser.geolocation``.
    """

    def __init__(self, parser):
        response = self.geolocate(parser.data)
        # `json` may be empty/None when nothing was found; fall back to {}.
        self._data = response.json or {}
        parser.geolocation = self

    def geolocate(self, record):
        """Return the ``geocoder.osm`` response for the record's address."""
        return geocoder.osm(build_address(record))

    def _field(self, key):
        # Single access point for the geocoder payload; missing keys give None.
        return self._data.get(key)

    @property
    def latlng(self):
        return [self._field("lat"), self._field("lng")]

    @property
    def address(self):
        return self._field("address")

    @property
    def street(self):
        return self._field("street")

    @property
    def postcode(self):
        return self._field("postcode")

    @property
    def town(self):
        return self._field("town")

    @property
    def county(self):
        return self._field("county")

    @property
    def region(self):
        return self._field("region")

    @property
    def country(self):
        return self._field("country")

    @property
    def data(self):
        """All exposed fields as a plain dict."""
        summary = {"latlng": self.latlng}
        for key in ("street", "postcode", "town", "county", "region", "country", "address"):
            summary[key] = self._field(key)
        return summary

    def __str__(self):
        return json.dumps(self.data)
@ -0,0 +1,158 @@
|
||||
# BUILT-INS |
||||
import os |
||||
import re |
||||
from PIL import Image |
||||
|
||||
# VENDOR |
||||
import cv2 |
||||
import numpy as np |
||||
import pytesseract |
||||
import pdf2image |
||||
|
||||
|
||||
def get_grayscale(img):
    """Return ``img`` converted with ``cv2.COLOR_BGR2GRAY``."""
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray
||||
|
||||
|
||||
def binarize(img):
    """Binarise ``img`` with a Gaussian adaptive threshold (max value 255)."""
    block_size = 11  # neighbourhood size passed to adaptiveThreshold
    offset = 2       # constant passed as adaptiveThreshold's C argument
    return cv2.adaptiveThreshold(
        img,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )
||||
|
||||
|
||||
def remove_noise(img):
    """Return ``img`` smoothed with a median blur of aperture 7."""
    aperture = 7
    return cv2.medianBlur(img, aperture)
||||
|
||||
|
||||
def thresholding(img):
    """Return ``img`` binarised with ``THRESH_BINARY + THRESH_OTSU``."""
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    _, binary = cv2.threshold(img, .0, 255., cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
||||
|
||||
|
||||
def dilate(img):
    """Dilate ``img`` once with a 3x3 kernel of ones."""
    return cv2.dilate(img, np.ones((3, 3), np.uint8), iterations=1)
||||
|
||||
|
||||
def erode(img):
    """Erode ``img`` once with a 1x1 kernel of ones."""
    # NOTE(review): a 1x1 kernel looks like it leaves the image unchanged —
    # confirm whether a larger kernel was intended.
    return cv2.erode(img, np.ones((1, 1), np.uint8), iterations=1)
||||
|
||||
|
||||
def opening(img):
    """Apply a morphological opening (``cv2.MORPH_OPEN``) with a 3x3 kernel of ones."""
    return cv2.morphologyEx(img, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))
||||
|
||||
|
||||
def canny(img):
    """Return the Canny edge map of ``img`` using thresholds 100 and 200."""
    low_threshold, high_threshold = 100, 200
    return cv2.Canny(img, low_threshold, high_threshold)
||||
|
||||
|
||||
def deskew(img):
    """Rotate ``img`` so that its non-zero pixels are axis-aligned.

    The skew angle is estimated from the minimum-area rectangle around all
    non-zero pixels, and the image is rotated about its centre by the opposite
    angle.

    Args:
        img: Image array; pixels ``> 0`` are treated as content.

    Returns:
        The rotated image, same size as the input.
    """
    # np.where yields int64 indices; cast to 32-bit floats for minAreaRect.
    coords = np.column_stack(np.where(img > 0)).astype(np.float32)
    # minAreaRect returns ((cx, cy), (w, h), angle): the angle is the LAST
    # element; index 1 is the (w, h) size tuple.
    angle = cv2.minAreaRect(coords)[-1]
    # NOTE(review): this is the usual recipe for OpenCV versions reporting the
    # angle in [-90, 0); newer releases use a different range — confirm against
    # the installed version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.)
    # The function is `warpAffine`; `wrapAffine` does not exist in cv2.
    return cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
||||
|
||||
|
||||
def match_template(img, template):
    """Return the ``cv2.TM_CCOEFF_NORMED`` match map of ``template`` over ``img``."""
    scores = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)
    return scores
||||
|
||||
|
||||
def get_rotation(img):
    """Return the rotation reported by Tesseract's orientation detection.

    Args:
        img: Image passed to ``pytesseract.image_to_osd``.

    Returns:
        The integer following ``"Rotate: "`` in the OSD report, or ``None``
        when the report has no such field. (Previously the raw ``re.Match``
        object was returned instead of the angle.)
    """
    osd = pytesseract.image_to_osd(img)
    found = re.search(r"(?<=Rotate: )\d+", osd)
    return int(found.group()) if found else None
||||
|
||||
|
||||
def pdf_to_images(file_path):
    """Render every page of a PDF to a PNG file and return the file paths.

    Pages are rendered at 300 dpi into ``../images/<pdf base name>/`` (relative
    to this module) as ``0.png``, ``1.png``, ... Any previous output folder for
    the same PDF is removed first.

    Args:
        file_path: Path of the PDF to render.

    Returns:
        The list of written image paths, in page order.
    """
    import shutil

    directory = os.path.relpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), "../images"))
    subdirectory = os.path.join(directory, os.path.basename(os.path.splitext(file_path)[0]))
    if os.path.isdir(subdirectory):
        # rmtree also copes with nested content, unlike remove() + rmdir().
        shutil.rmtree(subdirectory)

    # makedirs creates the parent `images` folder too; a plain mkdir raised
    # FileNotFoundError on a fresh checkout.
    os.makedirs(subdirectory)

    img_paths = []
    for i, img in enumerate(pdf2image.convert_from_path(file_path, dpi=300)):
        img_path = os.path.join(subdirectory, f"{i}.png")
        img.save(img_path)
        img_paths.append(img_path)

    return img_paths
||||
|
||||
|
||||
class ImageParser(object):
    """OCR front-end for scanned PDFs.

    The PDF is rendered to one image per page, every page is preprocessed
    (grayscale, median blur, adaptive threshold, opening) and ``text`` runs
    Tesseract over the result.
    """

    def __init__(self, file_path):
        """Render and preprocess the pages of ``file_path``.

        Raises:
            ValueError: ``file_path`` is empty or not a string.
            FileExistsError: no file exists at ``file_path``. (The exception
                type is kept as-is for existing callers.)
        """
        if not file_path or not isinstance(file_path, str):
            raise ValueError("file_path arguments is not a valid type")
        elif not os.path.isfile(file_path):
            raise FileExistsError("Can't find nothing at the end of the path")

        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.images = [cv2.imread(img_path) for img_path in pdf_to_images(file_path)]
        self.preprocess()

    @property
    def text(self):
        """OCR text of all pages; each page is one line preceded by a newline."""
        return "".join(
            # Runs of newlines or spaces inside a page collapse to one space.
            "\n" + re.sub(r"(\n+| +)", " ", pytesseract.image_to_string(img, lang="spa"))
            for img in self.images
        )

    def preprocess(self):
        """Replace ``self.images`` with their cleaned, binarised versions.

        The input images must still be BGR: running this twice on the same
        instance feeds single-channel images to the grayscale conversion.
        """
        self.images = [
            opening(binarize(remove_noise(get_grayscale(img))))
            for img in self.images
        ]

    def show_image(self, name, img):
        """Display ``img`` in a window titled ``name`` until a key is pressed."""
        cv2.imshow(name, img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: OCR one sample PDF and dump the preprocessed pages.
    file_path = os.path.join("../pdfs/T1/Verdi 106.pdf")
    # The constructor already preprocesses the pages; calling preprocess()
    # again would feed single-channel images to the BGR->gray conversion.
    parser = ImageParser(file_path)
    print(parser.text)
    out_path = os.path.join("../images")
    # Number the outputs: a counter that is never incremented made every page
    # overwrite test-1.png.
    for i, img in enumerate(parser.images, start=1):
        cv2.imwrite(os.path.join(out_path, "test-%s.png" % i), img)
@ -0,0 +1,59 @@
|
||||
# BUILT-INS |
||||
import os |
||||
|
||||
# VENDOR |
||||
from PyPDF2 import PdfFileReader |
||||
|
||||
# SOURCE |
||||
from .text import TextParser |
||||
from .image import ImageParser |
||||
|
||||
|
||||
class PdfParser(object):
    """Open a PDF and expose its content as a parsed ``TextParser``.

    PDFs whose first page has an extractable text layer are read directly;
    all others are treated as scans and sent through ``ImageParser`` (OCR).
    """

    def __init__(self, file_path):
        """Open ``file_path`` and load its pages.

        Raises:
            ValueError: ``file_path`` is empty or not a string.
            FileExistsError: no file exists at ``file_path``. (The exception
                type is kept as-is for existing callers.)
        """
        if not file_path or not isinstance(file_path, str):
            raise ValueError("file_path arguments is not a valid type")
        elif not os.path.isfile(file_path):
            raise FileExistsError("Can't find nothing at the end of the path")

        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self._data = None
        # The handle stays open for the object's lifetime; __del__ closes it.
        self.conn = open(file_path, "rb")
        self.parser = PdfFileReader(self.conn)
        self.pages = [self.parser.getPage(i) for i in range(self.parser.getNumPages())]

    @property
    def format(self):
        """``"str"`` when the first page has extractable text, else ``"img"``."""
        return "str" if self.pages[0].extractText() else "img"

    @property
    def data(self):
        """The parsed document as a ``TextParser``.

        Built on first access and cached, so repeated reads do not re-extract
        (or re-OCR) the document.
        """
        if getattr(self, "_data", None) is None:
            if self.format == "str":
                text = "".join("\n" + page.extractText() for page in self.pages)
            else:
                text = ImageParser(self.file_path).text
            self._data = TextParser(text)
        return self._data

    def __str__(self):
        return self.data.text

    def __del__(self):
        # `conn` is missing when __init__ raised during validation.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: print the ownership fields of every PDF in ../pdfs/T1.
    # NOTE(review): this module uses relative imports, so it presumably has to
    # be started with `python -m` from the package root — confirm.
    here = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(here, "../pdfs/T1"))
    for file_name in os.listdir(directory):
        parser = PdfParser(os.path.join(directory, file_name))
        print(file_name.upper())
        print(parser.data.ownership.data)
        print()
@ -0,0 +1,345 @@
|
||||
# BUILT-INS |
||||
import re |
||||
from unidecode import unidecode |
||||
|
||||
|
||||
class Parser(object):
    """Base class for one section of the document: a thin wrapper around its text.

    Instances can be concatenated with ``+`` and are falsy when their text is
    empty (through ``__len__``).
    """

    # Backing store for `text`; None until a non-empty string is assigned.
    __text = None

    def __init__(self, text=""):
        self.text = text

    def __str__(self):
        return self.text

    def __add__(self, other):
        """Return a new parser holding both texts joined by a newline.

        A falsy ``other`` (``None``, ``""``, an empty parser) or an unsupported
        type returns ``self`` unchanged. A string operand is re-parsed with the
        subclass's ``match``.
        """
        if other:
            if type(other) == type(self):
                return self.__class__(self.text + "\n" + other.text)
            elif type(other) == str:
                return self.__class__.match(self.text + "\n" + other)

        return self

    def __radd__(self, other):
        """Mirror of ``__add__`` with ``other`` placed first."""
        if other:
            if type(other) == type(self):
                return self.__class__(other.text + "\n" + self.text)
            elif type(other) == str:
                return self.__class__.match(other + "\n" + self.text)

        return self

    def __len__(self):
        return len(self.text)

    @staticmethod
    def search(pattern, string):
        """Case-insensitive ``re.search`` on the ASCII transliteration of ``string``."""
        # `unidecode` is imported as a function (`from unidecode import
        # unidecode`), so it is called directly.
        return re.search(pattern, unidecode(string), flags=re.IGNORECASE)

    @staticmethod
    def sub(pattern, string, replace):
        """Case-insensitive ``re.sub`` on the ASCII transliteration of ``string``.

        Replaces every match of ``pattern`` in ``string`` with ``replace``.
        """
        # re.sub's order is (pattern, repl, string): the replacement goes
        # second, the subject text third.
        return re.sub(pattern, replace, unidecode(string), count=0, flags=re.IGNORECASE)

    @property
    def text(self):
        return self.__text or ""

    @text.setter
    def text(self, text):
        # Assigning appends (on a new line) instead of overwriting; anything
        # other than a non-empty str is ignored.
        if text and type(text) == str:
            if self.__text:
                self.__text += "\n" + text
            else:
                self.__text = text

    @property
    def data(self):
        return self.text
||||
|
||||
|
||||
class CoverParser(Parser):
    """Section that precedes the ``DESCRIPCION`` heading."""

    @staticmethod
    def match(text):
        """Return a ``CoverParser`` for the leading cover text, or ``None``.

        The first run of characters in which ``DESCRIPCION`` (any case) does
        not start is taken as the cover.
        """
        found = re.search(r"((?!DESCRIPCION).)+", text, flags=re.IGNORECASE)
        if not found:
            return None

        cover_content = re.sub(r"DESCRIPCION.*", "", found.group(), count=0, flags=re.IGNORECASE)
        return CoverParser(cover_content)
||||
|
||||
|
||||
def clean_match(getter):
    """Decorator that tidies the value returned by a field getter.

    NOTE(review): ``clean_match`` decorates several getters in this module but
    was neither defined nor imported, so the module could not be imported.
    This minimal stand-in trims surrounding whitespace from string results and
    passes every other value (including ``None``) through unchanged — confirm
    against the intended implementation.
    """
    def wrapper(self):
        value = getter(self)
        return value.strip() if isinstance(value, str) else value

    wrapper.__name__ = getter.__name__
    wrapper.__doc__ = getter.__doc__
    return wrapper


class DescriptionParser(Parser):
    """The ``DESCRIPCION`` section: what the property is and where it is."""

    @staticmethod
    def match(text):
        """Return a ``DescriptionParser`` for the description body, or ``None``.

        The body runs from ``DESCRIPCION:`` up to the next
        ``TITULO``/``TITULARES``/``TITULARIDADES`` heading. It is taken from
        the ASCII transliteration produced by ``Parser.search``, so it carries
        no accents.
        """
        match = Parser.search(r"DESCRIPCION *\: *((?!(TITULO|TITULARES|TITULARIDADES)).)+", text)
        if match:
            description_body = re.sub(
                r"^DESCRIPCION\:? *",
                "",
                re.sub(r"€+", " ", match.group()),
                count=0,
                flags=re.IGNORECASE,
            )
            return DescriptionParser(description_body)

    @property
    def data(self):
        """All extracted fields as a dict (missing fields are ``None``)."""
        return {
            "qualification": self.qualification,
            "type": self.type,
            "town": self.town,
            "street": self.street,
            "number": self.number,
            "surface": self.surface,
        }

    @property
    @clean_match
    def qualification(self):
        """``RUSTICA``/``URBANA`` as written, else ``URBANA`` for non-land types."""
        match = re.search(r"(RUSTICA|URBANA)", self.text)
        if match:
            return match.group()

        # Fallback: any recognised type other than a plot of land is urban.
        kind = self.type
        kind = kind.lower() if kind else kind
        if kind and "terreno" not in kind:
            return "URBANA"

    @property
    @clean_match
    def type(self):
        match = re.search(r"(PORCION DE TERRENO|VIVIENDA|CASA)", self.text)
        if match:
            return match.group()

    @property
    @clean_match
    def town(self):
        """Text after "sita en"/"VIVIENDA en"... up to the next punctuation mark."""
        match = re.search(r"(?:sita|sito|situado|situada) en (((?!(?:\.|\,|\;)).)+)", self.text)
        if match:
            return re.sub(r" *calle.*", "", match.groups()[0])

        match = re.search(r"(?:PORCION DE TERRENO|VIVIENDA) en (((?!(?:\.|\,|\;)).)+)", self.text)
        if match:
            return re.sub(r" *calle.*", "", match.groups()[0])

    @property
    @clean_match
    def street(self):
        # Rural plots have no street address.
        if self.qualification and self.qualification.lower() == "rustica":
            return None

        match = re.search(r"(calle|plaza|camino) *(((?!,).)+)", self.text)
        if match:
            # `n[uú]mero`: text produced by `match` is transliterated, so the
            # accented spelling alone would never be found.
            return match.groups()[0] + " " + re.sub(r" *n[uú]mero.*", "", match.groups()[1])

    @property
    @clean_match
    def number(self):
        if self.qualification and self.qualification.lower() == "rustica":
            return None

        match = re.search(r"(?:n[uú]mero) * (((?!(?:\.|\,|\;)).)+)", self.text)
        if match:
            return match.groups()[0]

    @property
    @clean_match
    def surface(self):
        match = re.search(r"superficie *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)", self.text)
        if match:
            return match.groups()[0]
||||
|
||||
|
||||
class OwnershipParser(Parser):
    """The ``TITULO`` section: owner, identity number and title deed details."""

    @staticmethod
    def match(text):
        """Return an ``OwnershipParser`` for the text from ``TITULO:`` up to ``CARGAS``, or ``None``."""
        found = re.search(r"TITULO *\: *((?!CARGAS).)+", text)
        if not found:
            return None

        without_euro = re.sub(r"€+", " ", found.group())
        ownership_body = re.sub(r"^TITULO *\: *", "", without_euro)
        return OwnershipParser(ownership_body)

    def _capture(self, pattern, group=1):
        # Search this section and return the stripped capture group, or None
        # when the pattern is not found.
        found = re.search(pattern, self.text)
        if found:
            return found.group(group).strip()

    @property
    def data(self):
        """All extracted fields as a dict (missing fields are ``None``)."""
        return {
            "owner": self.owner,
            "nie": self.nie,
            "participation": self.participation,
            "adjudication": self.adjudication,
            "notary": self.notary,
            "town": self.town,
            "date": self.date,
        }

    @property
    def owner(self):
        return self._capture(r"Titular\/es\:? *(((?!,).)+)")

    @property
    def nie(self):
        # The identity number is the second group; the first is the DNI/CIF label.
        return self._capture(r"(DNI|CIF).* ([a-zA-Z]?[0-9]{8}[a-zA-Z]?) ", group=2)

    @property
    def participation(self):
        return self._capture(r"Participación *\: *(((?!Título).)+)")

    @property
    @clean_match
    def adjudication(self):
        return self._capture(r"Título *: *(((?!Notario).)+)")

    @property
    @clean_match
    def notary(self):
        return self._capture(r"Notario(?:\/Autoridad)? *: *(((?!Población).)+) ")

    @property
    @clean_match
    def town(self):
        return self._capture(r"Población *: *(((?!Fecha).)+)")

    @property
    @clean_match
    def date(self):
        return self._capture(r"Fecha (?:documento|escritura) *: *((?!Protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})")
||||
|
||||
|
||||
class ChargesParser(Parser):
    """The ``CARGAS`` section."""

    @staticmethod
    def match(text):
        """Return a ``ChargesParser`` for the text from ``CARGAS:`` up to ``PRESENTACION``, or ``None``."""
        found = re.search(r"CARGAS *\: *((?!PRESENTACION).)+", text)
        if not found:
            return None

        without_euro = re.sub("€+", " ", found.group())
        charges_body = re.sub(r"^CARGAS\: *", "", without_euro)
        return ChargesParser(charges_body)
||||
|
||||
# def get_presentation (text): |
||||
# match = re.search(r"(PRESENTACION\:)?((?!-+).)+", text) |
||||
# if match: |
||||
# return re.sub("^PRESENTACION\: *", "", re.sub(r"€+", " ", match.group())) |
||||
|
||||
|
||||
class TextParser(object):
    """Split a document's text into cover, description, ownership and charges.

    Each section is available as a property holding the matching ``*Parser``.
    Assigning a string to a section parses it with that parser's ``match``;
    assigning a parser instance appends it to the current one.
    """

    # Empty defaults returned until a section has been assigned on the instance.
    __cover = CoverParser()
    __description = DescriptionParser()
    __ownership = OwnershipParser()
    __charges = ChargesParser()

    def __init__(self, text):
        # After each section is extracted its text is removed from the
        # remainder before the next section is searched.
        self.cover = text
        text = secure_replace(self.cover.text, "", text)
        self.description = text
        # Strip the description here (the cover was already removed above).
        text = secure_replace(self.description.text, "", text)
        self.ownership = text
        text = secure_replace(self.ownership.text, "", text)
        self.charges = text

    @staticmethod
    def _merged(current, value, parser_cls, label):
        # Combine `current` with a parser instance or with the section parsed
        # out of a string; any other type is rejected.
        if isinstance(value, parser_cls):
            return current + value
        if isinstance(value, str):
            return current + parser_cls.match(value)
        raise TypeError(f"{label} property accepts string type values. {type(value)} was found.")

    @property
    def cover(self):
        return self.__cover

    @cover.setter
    def cover(self, cover):
        if cover:
            self.__cover = self._merged(self.cover, cover, CoverParser, "Cover")

    @property
    def description(self):
        return self.__description

    @description.setter
    def description(self, description):
        if description:
            self.__description = self._merged(self.description, description, DescriptionParser, "Description")

    @property
    def ownership(self):
        return self.__ownership or OwnershipParser()

    @ownership.setter
    def ownership(self, ownership):
        if ownership:
            self.__ownership = self._merged(self.ownership, ownership, OwnershipParser, "Ownership")

    @property
    def charges(self):
        return self.__charges or ChargesParser()

    @charges.setter
    def charges(self, charges):
        if charges:
            self.__charges = self._merged(self.charges, charges, ChargesParser, "Charges")

    @property
    def text(self):
        """Plain-text dump of the description, ownership and charges sections."""
        return "Description:\r" \
            + f"{self.description}\n" \
            + "Ownership:\r" \
            + f"{self.ownership}\n" \
            + "Charges:\r" \
            + f"{self.charges}\n"

    @property
    def success(self):
        """Truthy when at least one of the three main sections has text."""
        return len(self.description) or len(self.charges) or len(self.ownership)

    def __str__(self):
        return self.text
||||
|
||||
|
||||
def secure_replace(pattern, replace, string):
    """Best-effort ``re.sub``: return ``string`` untouched if the substitution fails.

    Any exception raised by ``re.sub`` (for example an invalid ``pattern``)
    is swallowed and the original ``string`` is returned instead.
    """
    result = string
    try:
        result = re.sub(pattern, replace, string)
    except Exception:
        pass
    return result
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test over the sample folder ../pdfs/T1.
    import os

    # NOTE(review): TextParser receives the file *path* as its text here, not
    # the document's content — confirm whether the PDF text was meant to be
    # extracted first.
    here = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(here, "../pdfs/T1"))
    for file_name in os.listdir(directory):
        parser = TextParser(os.path.join(directory, file_name))
        print(file_name.upper())
        print(parser.description)
        print(parser.description.data)
        print()
Loading…
Reference in new issue