You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

345 lines
10 KiB

1 year ago
# BUILT-INS
import re
from unidecode import unidecode
class Parser(object):
    """Base holder for a chunk of text, with accent-insensitive regex helpers.

    The text is kept in a private attribute and exposed through the ``text``
    property. Assigning to ``text`` *appends* to whatever is already stored
    (newline-separated) instead of replacing it. An instance with no text has
    length 0 and is therefore falsy.

    ``+`` with a plain string calls ``self.__class__.match``; this base class
    does not define ``match``, so only subclasses that provide it support
    string concatenation.
    """

    # Class-level default so ``text`` can be read before anything is stored.
    __text = None

    def __init__(self, text=""):
        self.text = text

    def __str__(self):
        return self.text

    @staticmethod
    def _join(left, right):
        """Join two text fragments with a newline, skipping empty ones.

        Mirrors the ``text`` setter, which only inserts a newline when there
        is already stored text.
        """
        return "\n".join(part for part in (left, right) if part)

    def __add__(self, other):
        """Return ``self`` followed by ``other``.

        * same concrete type: a new instance holding both texts;
        * ``str``: whatever ``match`` of the concrete class returns for the
          combined text (the visible subclasses return ``None`` when their
          pattern is not found);
        * anything falsy or of another type: ``self`` unchanged.
        """
        if other:
            if type(other) is type(self):
                return self.__class__(self._join(self.text, other.text))
            if isinstance(other, str):
                return self.__class__.match(self._join(self.text, other))
        return self

    def __radd__(self, other):
        """Reflected ``+``: same rules as ``__add__`` with ``other`` first."""
        if other:
            if type(other) is type(self):
                return self.__class__(self._join(other.text, self.text))
            if isinstance(other, str):
                return self.__class__.match(self._join(other, self.text))
        return self

    def __len__(self):
        return len(self.text)

    @staticmethod
    def search(pattern, string):
        """Case-insensitive ``re.search`` over the transliterated ``string``.

        ``unidecode`` is imported as a function (``from unidecode import
        unidecode``), so it is called directly.
        """
        return re.search(pattern, unidecode(string), flags=re.IGNORECASE)

    @staticmethod
    def sub(pattern, string, replace):
        """Case-insensitive ``re.sub`` over the transliterated ``string``.

        Note the parameter order of this helper (``string`` before
        ``replace``) differs from ``re.sub`` (``repl`` before ``string``);
        the arguments are reordered here accordingly.
        """
        return re.sub(pattern, replace, unidecode(string), count=0, flags=re.IGNORECASE)

    @property
    def text(self):
        """Stored text, or ``""`` when nothing has been stored."""
        return self.__text or ""

    @text.setter
    def text(self, text):
        # Empty or non-string values are ignored; strings are appended on a
        # new line after any existing text.
        if text and isinstance(text, str):
            if self.__text:
                self.__text += "\n" + text
            else:
                self.__text = text

    @property
    def data(self):
        """Parsed representation; for the base class this is the raw text."""
        return self.text
class CoverParser(Parser):
    """Parser for the part of the text that precedes the DESCRIPCION label."""

    @staticmethod
    def match(text):
        """Build a CoverParser from the first run of text before DESCRIPCION.

        The search is case-insensitive and takes the first non-empty run of
        characters in which no position starts the word ``DESCRIPCION``.
        Returns ``None`` when no such run exists.
        """
        found = re.search(r"((?!DESCRIPCION).)+", text, flags=re.IGNORECASE)
        if found is None:
            return None
        # Drop anything from a DESCRIPCION label onwards, should one remain.
        trimmed = re.sub(r"DESCRIPCION.*", "", found.group(), count=0, flags=re.IGNORECASE)
        return CoverParser(trimmed)
class DescriptionParser(Parser):
    """Parser for the DESCRIPCION section and the property details inside it.

    Each detail is exposed as a property that runs a regular expression over
    ``self.text`` and returns the captured fragment, or ``None`` when the
    pattern is not found. The properties are wrapped by ``clean_match``.
    NOTE(review): ``clean_match`` is not defined in the visible part of this
    file; it must be provided elsewhere for this class to load.
    """

    @staticmethod
    def match(text):
        """Build a DescriptionParser from the DESCRIPCION section of ``text``.

        The section runs from the ``DESCRIPCION:`` label up to (not including)
        the first ``TITULO``, ``TITULARES`` or ``TITULARIDADES``. Runs of
        ``€`` are replaced by a single space and the leading label is removed.
        Returns ``None`` when the label is not found.
        """
        found = Parser.search(r"DESCRIPCION *\: *((?!(TITULO|TITULARES|TITULARIDADES)).)+", text)
        if not found:
            return None
        without_euro = re.sub(r"€+", " ", found.group())
        description_body = re.sub(r"^DESCRIPCION\:? *", "", without_euro, count=0, flags=re.IGNORECASE)
        return DescriptionParser(description_body)

    @property
    def data(self):
        """All extracted fields as a dict keyed by field name."""
        return {
            "qualification": self.qualification,
            "type": self.type,
            "town": self.town,
            "street": self.street,
            "number": self.number,
            "surface": self.surface,
        }

    @property
    @clean_match
    def qualification(self):
        """``RUSTICA`` / ``URBANA`` as written in the text, else inferred.

        When neither word appears, ``"URBANA"`` is returned if a property type
        was found and it does not mention "terreno"; otherwise ``None``.
        """
        found = re.search(r"(RUSTICA|URBANA)", self.text)
        if found:
            return found.group()
        kind = self.type
        if kind and "terreno" not in kind.lower():
            return "URBANA"
        return None

    @property
    @clean_match
    def type(self):
        """Property type keyword found in the text, or ``None``."""
        found = re.search(r"(PORCION DE TERRENO|VIVIENDA|CASA)", self.text)
        if found:
            return found.group()
        return None

    @property
    @clean_match
    def town(self):
        """Location following "sita/sito/situado/situada en" or "<type> en".

        The capture stops at the first ``.``, ``,`` or ``;`` and any trailing
        " calle ..." part is removed. The patterns are tried in order.
        """
        patterns = (
            r"(?:sita|sito|situado|situada) en (((?!(?:\.|\,|\;)).)+)",
            r"(?:PORCION DE TERRENO|VIVIENDA) en (((?!(?:\.|\,|\;)).)+)",
        )
        for pattern in patterns:
            found = re.search(pattern, self.text)
            if found:
                return re.sub(r" *calle.*", "", found.group(1))
        return None

    @property
    @clean_match
    def street(self):
        """Street kind and name (up to the first comma); ``None`` if rustic."""
        if self.qualification and self.qualification.lower() == "rustica":
            return None
        found = re.search(r"(calle|plaza|camino) *(((?!,).)+)", self.text)
        if found:
            # Keep the kind ("calle", ...) and cut the name at "número".
            return found.group(1) + " " + re.sub(r" *número.*", "", found.group(2))
        return None

    @property
    @clean_match
    def number(self):
        """Text following "número" up to ``.``, ``,`` or ``;``; ``None`` if rustic."""
        if self.qualification and self.qualification.lower() == "rustica":
            return None
        found = re.search(r"(?:número) * (((?!(?:\.|\,|\;)).)+)", self.text)
        if found:
            return found.group(1)
        return None

    @property
    @clean_match
    def surface(self):
        """Text following "superficie ..." up to ``.``, ``,`` or ``;``."""
        found = re.search(r"superficie *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)", self.text)
        if found:
            return found.group(1)
        return None
class OwnershipParser(Parser):
    """Parser for the TITULO section: owner, ID, title deed details.

    Every field property searches ``self.text`` with one regular expression
    and returns the stripped capture, or ``None`` when nothing matches.
    """

    @staticmethod
    def match(text):
        """Build an OwnershipParser from the TITULO section of ``text``.

        The section spans from ``TITULO:`` up to (not including) ``CARGAS``.
        Runs of ``€`` become a space and the leading label is removed.
        Returns ``None`` when the label is not found.
        """
        found = re.search(r"TITULO *\: *((?!CARGAS).)+", text)
        if found is None:
            return None
        without_euro = re.sub(r"€+", " ", found.group())
        return OwnershipParser(re.sub(r"^TITULO *\: *", "", without_euro))

    @property
    def data(self):
        """All extracted fields as a dict keyed by field name."""
        return {
            "owner": self.owner,
            "nie": self.nie,
            "participation": self.participation,
            "adjudication": self.adjudication,
            "notary": self.notary,
            "town": self.town,
            "date": self.date,
        }

    @property
    # @clean_match
    def owner(self):
        """Text after "Titular/es" up to the first comma."""
        found = re.search(r"Titular\/es\:? *(((?!,).)+)", self.text)
        return found.group(1).strip() if found else None

    @property
    # @clean_match
    def nie(self):
        """Eight-digit identifier (optional letter before/after) after DNI/CIF."""
        found = re.search(r"(DNI|CIF).* ([a-zA-Z]?[0-9]{8}[a-zA-Z]?) ", self.text)
        return found.group(2).strip() if found else None

    @property
    # @clean_match
    def participation(self):
        """Text after "Participación:" up to "Título"."""
        found = re.search(r"Participación *\: *(((?!Título).)+)", self.text)
        return found.group(1).strip() if found else None

    @property
    @clean_match
    def adjudication(self):
        """Text after "Título:" up to "Notario"."""
        found = re.search(r"Título *: *(((?!Notario).)+)", self.text)
        return found.group(1).strip() if found else None

    @property
    @clean_match
    def notary(self):
        """Text after "Notario:" / "Notario/Autoridad:" up to "Población"."""
        found = re.search(r"Notario(?:\/Autoridad)? *: *(((?!Población).)+) ", self.text)
        return found.group(1).strip() if found else None

    @property
    @clean_match
    def town(self):
        """Text after "Población:" up to "Fecha"."""
        found = re.search(r"Población *: *(((?!Fecha).)+)", self.text)
        return found.group(1).strip() if found else None

    @property
    @clean_match
    def date(self):
        """dd/mm/yyyy date after "Fecha documento:" or "Fecha escritura:"."""
        found = re.search(r"Fecha (?:documento|escritura) *: *((?!Protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})", self.text)
        return found.group(1).strip() if found else None
class ChargesParser(Parser):
    """Parser for the CARGAS section of the text."""

    @staticmethod
    def match(text):
        """Build a ChargesParser from the CARGAS section of ``text``.

        The section spans from ``CARGAS:`` up to (not including)
        ``PRESENTACION``. Runs of ``€`` become a space and the leading label
        is removed. Returns ``None`` when the label is not found.
        """
        found = re.search(r"CARGAS *\: *((?!PRESENTACION).)+", text)
        if found is None:
            return None
        without_euro = re.sub("€+", " ", found.group())
        return ChargesParser(re.sub(r"^CARGAS\: *", "", without_euro))
# def get_presentation (text):
# match = re.search(r"(PRESENTACION\:)?((?!-+).)+", text)
# if match:
# return re.sub("^PRESENTACION\: *", "", re.sub(r"€+", " ", match.group()))
class TextParser(object):
    """Split a document's text into cover, description, ownership and charges.

    Each section is held by its own parser object and exposed as a property.
    Assigning a string to a section property runs that parser's ``match`` on
    it and appends the result to what is already stored; assigning a parser
    instance of the matching class appends it directly. Falsy values are
    ignored; any other type raises ``TypeError``.
    """

    # Empty class-level defaults; the setters always bind a *new* object on
    # the instance, so these shared instances are never mutated.
    __cover = CoverParser()
    __description = DescriptionParser()
    __ownership = OwnershipParser()
    __charges = ChargesParser()

    def __init__(self, text):
        """Extract the four sections from ``text`` in document order.

        After each section is extracted, its text is removed from ``text``
        before the next section is searched. The section text is escaped so
        it is removed literally rather than interpreted as a regex pattern.
        """
        self.cover = text
        text = secure_replace(re.escape(self.cover.text), "", text)
        self.description = text
        # Strip the description that was just extracted (not the cover again).
        text = secure_replace(re.escape(self.description.text), "", text)
        self.ownership = text
        text = secure_replace(re.escape(self.ownership.text), "", text)
        self.charges = text

    @staticmethod
    def _merged(current, value, parser_cls, label):
        """Return ``current`` extended with ``value`` for one section.

        ``value`` is either an instance of ``parser_cls`` or a string to run
        through ``parser_cls.match``. Raises ``TypeError`` for anything else.
        """
        if type(value) is parser_cls:
            return current + value
        if isinstance(value, str):
            # match() yields None when the section is absent; adding a falsy
            # operand to a parser leaves the parser unchanged.
            return current + parser_cls.match(value)
        raise TypeError(
            f"{label} property accepts string type values. "
            f"{type(value).__name__} was found."
        )

    @property
    def cover(self):
        return self.__cover

    @cover.setter
    def cover(self, cover):
        if cover:
            self.__cover = self._merged(self.cover, cover, CoverParser, "Cover")

    @property
    def description(self):
        return self.__description

    @description.setter
    def description(self, description):
        if description:
            self.__description = self._merged(
                self.description, description, DescriptionParser, "Description"
            )

    @property
    def ownership(self):
        return self.__ownership or OwnershipParser()

    @ownership.setter
    def ownership(self, ownership):
        if ownership:
            self.__ownership = self._merged(
                self.ownership, ownership, OwnershipParser, "Ownership"
            )

    @property
    def charges(self):
        return self.__charges or ChargesParser()

    @charges.setter
    def charges(self, charges):
        if charges:
            self.__charges = self._merged(
                self.charges, charges, ChargesParser, "Charges"
            )

    @property
    def text(self):
        """Human-readable dump of the description, ownership and charges."""
        return (
            "Description:\r"
            + f"{self.description}\n"
            + "Ownership:\r"
            + f"{self.ownership}\n"
            + "Charges:\r"
            + f"{self.charges}\n"
        )

    @property
    def success(self):
        """Truthy when the description, charges or ownership section has text.

        Returns the length of the first non-empty one of those three (0 if
        all are empty); the cover is not considered.
        """
        return len(self.description) or len(self.charges) or len(self.ownership)

    def __str__(self):
        return self.text
def secure_replace(pattern, replace, string):
    """Apply ``re.sub(pattern, replace, string)``, falling back to ``string``.

    Any exception raised by the substitution (for example an invalid
    pattern) is swallowed and the input ``string`` is returned unchanged.
    """
    try:
        replaced = re.sub(pattern, replace, string)
    except Exception:
        # Best-effort by design: a failed substitution leaves the input as is.
        return string
    else:
        return replaced
if __name__ == "__main__":
    import os

    # Locate ../pdfs/T1 next to this file, expressed relative to the
    # current working directory.
    module_dir = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(module_dir, "../pdfs/T1"))

    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        # NOTE(review): the path string itself is handed to TextParser as the
        # text to parse; the file is never opened here — confirm this is the
        # intended input.
        parser = TextParser(file_path)
        description = parser.description
        print(file_name.upper())
        print(description)
        print(description.data)
        print()