|
|
|
# BUILT-INS
|
|
|
|
import re
|
|
|
|
from unidecode import unidecode
|
|
|
|
|
|
|
|
|
|
|
|
class Parser(object):
    """Base container for the text of one section of a parsed document.

    ``str()`` and ``len()`` reflect the stored text and ``+`` concatenates
    sections, separating them with a newline.  Subclasses provide a ``match``
    factory that extracts their own section from raw text and usually override
    ``data`` to expose the parsed fields.
    """

    # Stored section text; stays ``None`` until a non-empty string is assigned.
    __text = None

    def __init__(self, text=""):
        """Create a parser holding ``text`` (empty or non-string values are ignored)."""
        self.text = text

    def __str__(self):
        return self.text

    def __len__(self):
        # An empty parser has length 0 and is therefore falsy.
        return len(self.text)

    @staticmethod
    def _join(first, second):
        """Join two text fragments with a newline, skipping empty fragments.

        Skipping empty fragments avoids a stray leading/trailing newline when
        one side of a concatenation has no text yet.
        """
        return "\n".join(fragment for fragment in (first, second) if fragment)

    def __add__(self, other):
        """Return the concatenation ``self`` + ``other``.

        * ``other`` of exactly the same class: a new instance holding both texts.
        * ``other`` a string: the joined text is re-parsed through the class'
          ``match`` factory (which, for subclasses, may return ``None``).
        * anything else, or a falsy ``other`` (``None``, ``""``, ``0``, an empty
          parser): ``self`` is returned unchanged.
        """
        if other:
            if type(other) is type(self):
                return type(self)(self._join(self.text, other.text))
            if isinstance(other, str):
                return type(self).match(self._join(self.text, other))
        return self

    def __radd__(self, other):
        """Mirror of ``__add__`` with ``other`` placed before ``self``.

        Falsy left operands return ``self``, which lets ``sum()`` (whose default
        start value is ``0``) fold a list of parsers.
        """
        if other:
            if type(other) is type(self):
                return type(self)(self._join(other.text, self.text))
            if isinstance(other, str):
                return type(self).match(self._join(other, self.text))
        return self

    @classmethod
    def match(cls, text):
        """Build a parser from raw ``text``.

        The base implementation wraps ``text`` unchanged so that adding a string
        to a plain ``Parser`` works; subclasses override it to extract only
        their own section.
        """
        return cls(text)

    @staticmethod
    def clean_text(fn):
        """Placeholder with no behaviour yet: ignores ``fn`` and returns ``None``."""

    @staticmethod
    def search(pattern, string):
        """Case-insensitive ``re.search`` over the transliterated ``string``.

        ``string`` is passed through ``unidecode`` before matching, so patterns
        should be written without accents.
        """
        return re.search(pattern, unidecode(string), flags=re.IGNORECASE)

    @staticmethod
    def sub(pattern, string, replace):
        """Case-insensitive ``re.sub`` wrapper.

        The arguments are forwarded to ``re.sub`` in positional order, so despite
        the parameter names ``string`` is the replacement (transliterated with
        ``unidecode`` first) and ``replace`` is the text being searched.  Callers
        in this file pass ``(pattern, replacement, subject)`` accordingly.
        """
        return re.sub(pattern, unidecode(string), replace, count=0, flags=re.IGNORECASE)

    @property
    def text(self):
        """The stored text, or ``""`` when nothing has been stored yet."""
        return self.__text or ""

    @text.setter
    def text(self, text):
        # Assignment accumulates: a non-empty string is appended on a new line
        # to whatever is already stored.  Empty or non-string values are ignored.
        if text and isinstance(text, str):
            if self.__text:
                self.__text += "\n" + text
            else:
                self.__text = text

    @property
    def data(self):
        """Parsed representation of the section; the base class returns the raw text."""
        return self.text
|
|
|
|
|
|
|
|
|
|
|
|
class CoverParser(Parser):
    """Section parser for the cover: the text found before ``DESCRIPCION``."""

    @staticmethod
    def match(text):
        """Return a ``CoverParser`` for the cover part of ``text``.

        The cover is the first run of characters in which ``DESCRIPCION`` does
        not start.  Returns ``None`` when no such run exists.
        """
        found = Parser.search(r"((?!DESCRIPCION).)+", text)
        if not found:
            return None
        # Drop anything from a DESCRIPCION marker onwards before wrapping.
        return CoverParser(Parser.sub(r"DESCRIPCION.*", "", found.group()))
|
|
|
|
|
|
|
|
|
|
|
|
class DescriptionParser(Parser):
    """Section parser for the ``DESCRIPCION`` block of a document.

    Every field property searches the stored text with ``Parser.search`` and
    returns the matched fragment, or ``None`` when nothing matches.  Because
    ``Parser.search`` transliterates the text before matching, all patterns
    here are written without accents.
    """

    @staticmethod
    def match(text):
        """Return a ``DescriptionParser`` for the description found in ``text``.

        The section starts at ``DESCRIPCION:`` and stops before ``TITULO``,
        ``TITULARES`` or ``TITULARIDADES``.  Returns ``None`` when absent.
        """
        found = Parser.search(r"DESCRIPCION *\: *((?!(TITULO|TITULARES|TITULARIDADES)).)+", text)
        if not found:
            return None
        # NOTE(review): found.group() comes from text already passed through
        # unidecode by Parser.search, so a literal "€" may no longer be present
        # at this point - confirm what the transliteration yields.
        flattened = Parser.sub(r"€+", " ", found.group())
        # Allow spaces before the colon, exactly as the search pattern does, so
        # "DESCRIPCION : ..." does not leave a stray ": " prefix.
        return DescriptionParser(Parser.sub(r"^DESCRIPCION *\:? *", "", flattened))

    @property
    def data(self):
        """All parsed fields as a dictionary (missing fields are ``None``)."""
        return {
            "qualification": self.qualification,
            "type": self.type,
            "town": self.town,
            "street": self.street,
            "number": self.number,
            "surface": self.surface,
        }

    def _is_rustic(self):
        """Tell whether the property is qualified as ``RUSTICA`` (case-insensitive)."""
        qualification = self.qualification
        return bool(qualification) and qualification.lower() == "rustica"

    @property
    def qualification(self):
        """``RUSTICA``/``URBANA`` as written in the text, else inferred from the type.

        When the text carries no explicit qualification, any detected property
        type not containing "terreno" is reported as ``"URBANA"``; otherwise
        the result is ``None``.
        """
        found = Parser.search(r"(RUSTICA|URBANA)", self.text)
        if found:
            return found.group()
        kind = self.type
        if kind and "terreno" not in kind.lower():
            return "URBANA"
        return None

    @property
    def type(self):
        """Kind of property mentioned in the text, or ``None``."""
        found = Parser.search(r"(PORCION DE TERRENO|VIVIENDA|CASA)", self.text)
        return found.group() if found else None

    @property
    def town(self):
        """Location following "sita/sito/situado/situada en" or "<type> en", or ``None``."""
        location_patterns = (
            r"(?:sita|sito|situado|situada) en (((?!(?:\.|\,|\;)).)+)",
            r"(?:PORCION DE TERRENO|VIVIENDA) en (((?!(?:\.|\,|\;)).)+)",
        )
        for pattern in location_patterns:
            found = Parser.search(pattern, self.text)
            if found:
                # The street, when present, is reported separately.
                return Parser.sub(r" *calle.*", "", found.group(1))
        return None

    @property
    def street(self):
        """Street type and name; always ``None`` for properties qualified as rustic."""
        if self._is_rustic():
            return None
        found = Parser.search(r"(calle|plaza|camino) *(((?!,).)+)", self.text)
        if not found:
            return None
        street_kind, remainder = found.group(1), found.group(2)
        # The house number, when present, is reported separately.
        return street_kind + " " + Parser.sub(r" *numero.*", "", remainder)

    @property
    def number(self):
        """Text following "numero"; always ``None`` for properties qualified as rustic."""
        if self._is_rustic():
            return None
        # Written without the accent: the searched text is transliterated, so
        # an accented pattern could never match.
        found = Parser.search(r"(?:numero) * (((?!(?:\.|\,|\;)).)+)", self.text)
        return found.group(1) if found else None

    @property
    def surface(self):
        """Text following "superficie [total] [construida|edificada] [de]", or ``None``."""
        found = Parser.search(
            r"superficie *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)",
            self.text,
        )
        return found.group(1) if found else None
|
|
|
|
|
|
|
|
|
|
|
|
class OwnershipParser(Parser):
    """Section parser for the ``TITULO`` (ownership) block of a document.

    Every field property searches the stored text with ``Parser.search`` and
    returns the stripped capture, or ``None`` when nothing matches.  Because
    ``Parser.search`` transliterates the text before matching, all patterns
    here are written without accents.
    """

    @staticmethod
    def match(text):
        """Return an ``OwnershipParser`` for the ownership section of ``text``.

        The section starts at ``TITULO:`` and stops before ``CARGAS``.
        Returns ``None`` when absent.
        """
        found = Parser.search(r"TITULO *\: *((?!CARGAS).)+", text)
        if not found:
            return None
        # NOTE(review): found.group() comes from text already passed through
        # unidecode by Parser.search, so a literal "€" may no longer be present
        # at this point - confirm what the transliteration yields.
        flattened = Parser.sub(r"€+", " ", found.group())
        return OwnershipParser(Parser.sub(r"^TITULO *\: *", "", flattened))

    def _capture(self, pattern, group=1):
        """Return the stripped ``group`` of ``pattern`` in the text, or ``None``."""
        found = Parser.search(pattern, self.text)
        return found.group(group).strip() if found else None

    @property
    def data(self):
        """All parsed fields as a dictionary (missing fields are ``None``)."""
        return {
            "owner": self.owner,
            "nie": self.nie,
            "participation": self.participation,
            "adjudication": self.adjudication,
            "notary": self.notary,
            "town": self.town,
            "date": self.date,
        }

    @property
    def owner(self):
        """Text after "Titular/es:" up to the first comma."""
        return self._capture(r"Titular\/es\:? *(((?!,).)+)")

    @property
    def nie(self):
        """Eight-digit identifier (optional leading/trailing letter) after "DNI"/"CIF"."""
        return self._capture(r"(DNI|CIF).* ([a-zA-Z]?[0-9]{8}[a-zA-Z]?) ", group=2)

    @property
    def participation(self):
        """Text after "Participacion:" up to the "Titulo" label."""
        return self._capture(r"Participacion *\: *(((?!Titulo).)+)")

    @property
    def adjudication(self):
        """Text after "Titulo:" up to the "Notario" label."""
        return self._capture(r"Titulo *: *(((?!Notario).)+)")

    @property
    def notary(self):
        """Text after "Notario[/Autoridad]:" up to the "Poblacion" label."""
        return self._capture(r"Notario(?:\/Autoridad)? *: *(((?!Poblacion).)+) ")

    @property
    def town(self):
        """Text after "Poblacion:" up to the "Fecha" label."""
        return self._capture(r"Poblacion *: *(((?!Fecha).)+)")

    @property
    def date(self):
        """``dd/mm/yyyy`` date after "Fecha documento:" or "Fecha escritura:"."""
        return self._capture(
            r"Fecha (?:documento|escritura) *: *((?!Protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})"
        )
|
|
|
|
|
|
|
|
|
|
|
|
class ChargesParser(Parser):
    """Section parser for the ``CARGAS`` (charges) block of a document."""

    @staticmethod
    def match(text):
        """Return a ``ChargesParser`` for the charges section of ``text``.

        The section starts at ``CARGAS:`` and stops before ``PRESENTACION``.
        Returns ``None`` when absent.
        """
        found = Parser.search(r"CARGAS *\: *((?!PRESENTACION).)+", text)
        if not found:
            return None
        # NOTE(review): found.group() comes from text already passed through
        # unidecode by Parser.search, so a literal "€" may no longer be present
        # at this point - confirm what the transliteration yields.
        flattened = Parser.sub(r"€+", " ", found.group())
        # Allow spaces before the colon, exactly as the search pattern does, so
        # "CARGAS : ..." has its header removed too.
        return ChargesParser(Parser.sub(r"^CARGAS *\: *", "", flattened))
|
|
|
|
|
|
|
|
# def get_presentation (text):
|
|
|
|
# match = re.search(r"(PRESENTACION\:)?((?!-+).)+", text)
|
|
|
|
# if match:
|
|
|
|
# return re.sub("^PRESENTACION\: *", "", re.sub(r"€+", " ", match.group()))
|
|
|
|
|
|
|
|
|
|
|
|
class TextParser(object):
    """Split a document's text into cover, description, ownership and charges.

    Each section is exposed as a property holding the matching parser.
    Assigning a string to a property parses it with that section's ``match``
    factory; assigning a parser instance merges it into the current section.
    Falsy values are ignored and any other type raises ``TypeError``.
    """

    # Class-level defaults, kept so the properties work even on an instance
    # whose __init__ has not run; __init__ replaces them with per-instance
    # objects so that instances never share section state.
    __cover = CoverParser()
    __description = DescriptionParser()
    __ownership = OwnershipParser()
    __charges = ChargesParser()

    def __init__(self, text):
        """Parse ``text`` section by section.

        After each section is extracted its text is removed from the remaining
        input before the next section is searched for.
        """
        self.__cover = CoverParser()
        self.__description = DescriptionParser()
        self.__ownership = OwnershipParser()
        self.__charges = ChargesParser()

        # Section text is escaped because it must be removed literally: it is
        # document content, not a regular expression.
        self.cover = text
        text = secure_replace(re.escape(self.cover.text), "", text)
        self.description = text
        text = secure_replace(re.escape(self.description.text), "", text)
        self.ownership = text
        text = secure_replace(re.escape(self.ownership.text), "", text)
        self.charges = text

    @staticmethod
    def _absorb(current, value, parser_cls, label):
        """Return section ``current`` extended with ``value``.

        ``value`` may be falsy (ignored), a ``parser_cls`` instance, or a string
        to be parsed with ``parser_cls.match``.  ``label`` names the property in
        the error message.

        Raises:
            TypeError: if ``value`` is truthy and of any other type.
        """
        if not value:
            return current
        if isinstance(value, parser_cls):
            incoming = value
        elif isinstance(value, str):
            incoming = parser_cls.match(value)
        else:
            raise TypeError(
                f"{label} property accepts string type values. {type(value)} was found."
            )
        # Taking the new section as-is when nothing is stored yet avoids
        # joining it to an empty text with a newline separator.
        if incoming and not current:
            return incoming
        # Adding None or an empty parser leaves the current section unchanged.
        return current + incoming

    @property
    def cover(self):
        """The cover section parser."""
        return self.__cover

    @cover.setter
    def cover(self, cover):
        self.__cover = self._absorb(self.cover, cover, CoverParser, "Cover")

    @property
    def description(self):
        """The description section parser."""
        return self.__description

    @description.setter
    def description(self, description):
        self.__description = self._absorb(
            self.description, description, DescriptionParser, "Description"
        )

    @property
    def ownership(self):
        """The ownership section parser (a fresh empty one when nothing was parsed)."""
        return self.__ownership or OwnershipParser()

    @ownership.setter
    def ownership(self, ownership):
        self.__ownership = self._absorb(
            self.ownership, ownership, OwnershipParser, "Ownership"
        )

    @property
    def charges(self):
        """The charges section parser (a fresh empty one when nothing was parsed)."""
        return self.__charges or ChargesParser()

    @charges.setter
    def charges(self, charges):
        self.__charges = self._absorb(self.charges, charges, ChargesParser, "Charges")

    @property
    def text(self):
        """Human-readable dump of the description, ownership and charges sections."""
        # NOTE(review): each label ends with a carriage return ("\r"), not a
        # newline; kept as found - confirm whether "\n" was intended.
        return (
            f"Description:\r{self.description}\n"
            f"Ownership:\r{self.ownership}\n"
            f"Charges:\r{self.charges}\n"
        )

    @property
    def success(self):
        """Truthy when at least one of description, charges or ownership has text.

        The value is the length of the first non-empty section (``0`` if none).
        """
        return len(self.description) or len(self.charges) or len(self.ownership)

    def __str__(self):
        return self.text
|
|
|
|
|
|
|
|
|
|
|
|
def secure_replace(pattern, replace, string):
    """Return ``string`` with every match of ``pattern`` replaced by ``replace``.

    This is a best-effort ``re.sub``: if the substitution raises for any
    reason (an invalid pattern, an invalid replacement template, arguments of
    the wrong type, ...) the input ``string`` is returned unchanged instead.
    """
    try:
        return re.sub(pattern, replace, string)
    except Exception:
        # Deliberately broad: callers rely on always getting a value back,
        # whatever text is supplied as the pattern.
        return string
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import os

    # Manual smoke run: print the description section, and the fields parsed
    # from it, for every entry of ../pdfs/T1 (relative to this module).
    module_dir = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(module_dir, "../pdfs/T1"))

    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        # NOTE(review): the path string itself is handed to TextParser, not the
        # file's contents - confirm whether the file was meant to be read first.
        parser = TextParser(file_path)
        section = parser.description

        print(file_name.upper())
        print(section)
        print(section.data)
        print()
|