You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
349 lines
10 KiB
349 lines
10 KiB
# BUILT-INS |
|
import re |
|
from unidecode import unidecode |
|
|
|
|
|
class Parser(object):
    """Base holder for a fragment of text plus regex helpers.

    The wrapped text is exposed through the ``text`` property, whose setter
    *appends* (newline-separated) rather than overwrites once a value has
    been stored. Instances are falsy while their text is empty, because
    ``__len__`` reports the text length.
    """

    # Backing store for the ``text`` property.
    __text = None

    def __init__(self, text=""):
        """Store *text* by routing it through the ``text`` setter."""
        self.text = text

    def __str__(self):
        """Return the wrapped text."""
        return self.text

    def __add__(self, other):
        """Return ``self + other``.

        * falsy *other* (``None``, ``""``, an empty parser): ``self``.
        * *other* of exactly the same class: a new instance of that class
          built from both texts joined by a newline.
        * *other* is a ``str``: the result of ``match`` on the joined text.
          ``match`` is not defined on this base class in the visible code;
          the subclasses in this file provide it.
        * anything else: ``self``.
        """
        if not other:
            return self
        builder = self.__class__
        if type(other) == type(self):
            return builder(self.text + "\n" + other.text)
        if type(other) == str:
            return builder.match(self.text + "\n" + other)
        return self

    def __radd__(self, other):
        """Return ``other + self``; mirror image of ``__add__``."""
        if not other:
            return self
        builder = self.__class__
        if type(other) == type(self):
            return builder(other.text + "\n" + self.text)
        if type(other) == str:
            return builder.match(other + "\n" + self.text)
        return self

    def __len__(self):
        """Return the length of the wrapped text."""
        return len(self.text)

    @staticmethod
    def clean_text(fn):
        """Placeholder: currently does nothing and returns ``None``."""
        return None

    @staticmethod
    def search(pattern, string):
        """Case-insensitive ``re.search`` over the unidecode form of *string*."""
        subject = unidecode(string)
        return re.search(pattern, subject, flags=re.IGNORECASE)

    @staticmethod
    def sub(pattern, string, replace):
        """Case-insensitive ``re.sub`` wrapper.

        Mind the argument roles: *string* is handed to ``re.sub`` as the
        replacement (after unidecode) and *replace* as the text being
        processed. Callers in this file invoke it positionally as
        ``sub(pattern, replacement, subject)``.
        NOTE(review): the parameter names look swapped relative to their
        roles -- confirm before renaming, keyword callers would break.
        """
        replacement = unidecode(string)
        return re.sub(pattern, replacement, replace, count=0, flags=re.IGNORECASE)

    @property
    def text(self):
        """The wrapped text, or ``""`` when nothing has been stored."""
        stored = self.__text
        return stored if stored else ""

    @text.setter
    def text(self, text):
        # Empty values and anything that is not exactly ``str`` are ignored.
        if not text or type(text) != str:
            return
        if self.__text:
            # A value is already stored: append on a new line.
            self.__text += "\n" + text
        else:
            self.__text = text

    @property
    def data(self):
        """Structured view of the content; for the base class, the text itself."""
        return self.text
|
|
|
|
|
class CoverParser(Parser):
    """Parser for the text that precedes the ``DESCRIPCION`` section."""

    @staticmethod
    def match(text):
        """Build a ``CoverParser`` from *text*.

        Takes the first run of characters in which no ``DESCRIPCION`` keyword
        starts, drops anything from a ``DESCRIPCION`` keyword onwards, and
        wraps the result. Returns ``None`` when nothing matches.
        """
        found = Parser.search(r"((?!DESCRIPCION).)+", text)
        if not found:
            return None
        return CoverParser(Parser.sub(r"DESCRIPCION.*", "", found.group()))
|
|
|
|
|
class DescriptionParser(Parser):
    """Parser for the ``DESCRIPCION`` section of a document.

    Every field property runs a case-insensitive regex through
    ``Parser.search`` and returns the captured text, or ``None`` when the
    field is not found. ``Parser.search`` matches against the unidecode
    (ASCII) form of the text, so patterns here must be written without
    accents.
    """

    @staticmethod
    def match(text):
        """Build a ``DescriptionParser`` from *text*.

        Captures from the ``DESCRIPCION:`` header up to (not including) the
        next ``TITULO`` / ``TITULARES`` / ``TITULARIDADES`` keyword, then
        strips the header. Returns ``None`` when the header is absent.
        """
        found = Parser.search(
            r"DESCRIPCION *\: *((?!(TITULO|TITULARES|TITULARIDADES)).)+", text
        )
        if not found:
            return None
        # NOTE(review): the matched text has already been through unidecode,
        # so a literal euro sign may no longer be present here -- confirm.
        body = Parser.sub(r"€+", " ", found.group())
        # Mirror the search pattern (optional spaces before the colon) so the
        # whole header is removed, not just "DESCRIPCION".
        body = Parser.sub(r"^DESCRIPCION *\: *", "", body)
        return DescriptionParser(body)

    @property
    def data(self):
        """All extracted fields as a dict (values are ``str`` or ``None``)."""
        return {
            "qualification": self.qualification,
            "type": self.type,
            "town": self.town,
            "street": self.street,
            "number": self.number,
            "surface": self.surface,
        }

    @property
    def qualification(self):
        """``RUSTICA`` / ``URBANA`` as written in the text.

        When neither word appears, falls back to ``"URBANA"`` if a property
        type was found and it does not mention "terreno"; otherwise ``None``.
        """
        found = Parser.search(r"(RUSTICA|URBANA)", self.text)
        if found:
            return found.group()

        kind = self.type
        if kind and "terreno" not in kind.lower():
            return "URBANA"
        return None

    @property
    def type(self):
        """The property type keyword found in the text, or ``None``."""
        found = Parser.search(r"(PORCION DE TERRENO|VIVIENDA|CASA)", self.text)
        if found:
            return found.group()
        return None

    @property
    def town(self):
        """Location text following "sita/sito/situado/situada en" or
        "PORCION DE TERRENO/VIVIENDA en", cut before any "calle"."""
        patterns = (
            r"(?:sita|sito|situado|situada) en (((?!(?:\.|\,|\;)).)+)",
            r"(?:PORCION DE TERRENO|VIVIENDA) en (((?!(?:\.|\,|\;)).)+)",
        )
        # Patterns are tried in order; the first hit wins.
        for pattern in patterns:
            found = Parser.search(pattern, self.text)
            if found:
                return Parser.sub(r" *calle.*", "", found.group(1))
        return None

    @property
    def street(self):
        """Street kind and name (e.g. "calle ..."), without the number part.

        Always ``None`` when the qualification is "rustica".
        """
        qualification = self.qualification
        if qualification and qualification.lower() == "rustica":
            return None

        found = Parser.search(r"(calle|plaza|camino) *(((?!,).)+)", self.text)
        if found:
            name = Parser.sub(r" *numero.*", "", found.group(2))
            return found.group(1) + " " + name
        return None

    @property
    def number(self):
        """Text following "numero" up to the next ``.``, ``,`` or ``;``.

        Always ``None`` when the qualification is "rustica".
        """
        qualification = self.qualification
        if qualification and qualification.lower() == "rustica":
            return None

        # Unaccented on purpose: Parser.search matches against ASCII text, so
        # the accented spelling could never match.
        found = Parser.search(r"(?:numero) * (((?!(?:\.|\,|\;)).)+)", self.text)
        if found:
            return found.group(1)
        return None

    @property
    def surface(self):
        """Text following "superficie [total] [construida|edificada] [de]"."""
        found = Parser.search(
            r"superficie *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)",
            self.text,
        )
        if found:
            return found.group(1)
        return None
|
|
|
|
|
class OwnershipParser(Parser):
    """Parser for the ``TITULO`` (ownership) section of a document.

    Every field property runs a case-insensitive regex through
    ``Parser.search`` and returns the stripped capture, or ``None`` when the
    field is not found. ``Parser.search`` matches against the unidecode
    (ASCII) form of the text, so field labels are spelled without accents
    here; accented labels could never match.
    """

    @staticmethod
    def match(text):
        """Build an ``OwnershipParser`` from *text*.

        Captures from the ``TITULO:`` header up to (not including) the
        ``CARGAS`` keyword, then strips the header. Returns ``None`` when the
        header is absent.
        """
        found = Parser.search(r"TITULO *\: *((?!CARGAS).)+", text)
        if not found:
            return None
        # NOTE(review): the matched text has already been through unidecode,
        # so a literal euro sign may no longer be present here -- confirm.
        body = Parser.sub(r"€+", " ", found.group())
        return OwnershipParser(Parser.sub(r"^TITULO *\: *", "", body))

    @property
    def data(self):
        """All extracted fields as a dict (values are ``str`` or ``None``)."""
        return {
            "owner": self.owner,
            "nie": self.nie,
            "participation": self.participation,
            "adjudication": self.adjudication,
            "notary": self.notary,
            "town": self.town,
            "date": self.date,
        }

    @property
    def owner(self):
        """Text after "Titular/es:" up to the first comma."""
        found = Parser.search(r"Titular\/es\:? *(((?!,).)+)", self.text)
        if found:
            return found.group(1).strip()
        return None

    @property
    def nie(self):
        """Identifier following a "DNI"/"CIF" mention: 8 digits with an
        optional leading and/or trailing letter."""
        found = Parser.search(
            r"(DNI|CIF).* ([a-zA-Z]?[0-9]{8}[a-zA-Z]?) ", self.text
        )
        if found:
            return found.group(2).strip()
        return None

    @property
    def participation(self):
        """Text after "Participacion:" up to the next "Titulo" label."""
        found = Parser.search(
            r"Participacion *\: *(((?!Titulo).)+)", self.text
        )
        if found:
            return found.group(1).strip()
        return None

    @property
    def adjudication(self):
        """Text after "Titulo:" up to the next "Notario" label."""
        found = Parser.search(r"Titulo *: *(((?!Notario).)+)", self.text)
        if found:
            return found.group(1).strip()
        return None

    @property
    def notary(self):
        """Text after "Notario[/Autoridad]:" up to the "Poblacion" label."""
        found = Parser.search(
            r"Notario(?:\/Autoridad)? *: *(((?!Poblacion).)+) ", self.text
        )
        if found:
            return found.group(1).strip()
        return None

    @property
    def town(self):
        """Text after "Poblacion:" up to the next "Fecha" label."""
        found = Parser.search(r"Poblacion *: *(((?!Fecha).)+)", self.text)
        if found:
            return found.group(1).strip()
        return None

    @property
    def date(self):
        """``dd/mm/yyyy`` date after "Fecha documento:" / "Fecha escritura:"."""
        found = Parser.search(
            r"Fecha (?:documento|escritura) *: *((?!Protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})",
            self.text,
        )
        if found:
            return found.group(1).strip()
        return None
|
|
|
|
|
class ChargesParser(Parser):
    """Parser for the ``CARGAS`` (charges) section of a document."""

    @staticmethod
    def match(text):
        """Build a ``ChargesParser`` from *text*.

        Captures from the ``CARGAS:`` header up to (not including) the
        ``PRESENTACION`` keyword, then strips the header. Returns ``None``
        when the header is absent.
        """
        found = Parser.search(r"CARGAS *\: *((?!PRESENTACION).)+", text)
        if not found:
            return None
        # NOTE(review): the matched text has already been through unidecode,
        # so a literal euro sign may no longer be present here -- confirm.
        body = Parser.sub(r"€+", " ", found.group())
        # Mirror the search pattern (optional spaces before the colon) so a
        # header written as "CARGAS :" is stripped too.
        return ChargesParser(Parser.sub(r"^CARGAS *\: *", "", body))
|
|
|
# def get_presentation (text): |
|
# match = re.search(r"(PRESENTACION\:)?((?!-+).)+", text) |
|
# if match: |
|
# return re.sub("^PRESENTACION\: *", "", re.sub(r"€+", " ", match.group())) |
|
|
|
|
|
class TextParser(object):
    """Split a document's text into cover, description, ownership and
    charges sections, each held by its dedicated parser.

    Each section is exposed as a property. Assigning a string runs the
    section parser's ``match`` on it; assigning a parser instance merges it
    with the current one via ``+``. Falsy values are ignored.
    """

    # Class-level defaults: empty parsers used until a setter replaces them.
    __cover = CoverParser()
    __description = DescriptionParser()
    __ownership = OwnershipParser()
    __charges = ChargesParser()

    def __init__(self, text):
        """Parse *text* section by section.

        After each section is parsed, its text is removed from the working
        copy before the next section is looked up.
        NOTE(review): the section text is passed to ``secure_replace`` as a
        regex pattern, not as a literal -- confirm that is intended.
        """
        self.cover = text
        text = secure_replace(self.cover.text, "", text)
        self.description = text
        # Strip the description that was just parsed (the cover has already
        # been removed above).
        text = secure_replace(self.description.text, "", text)
        self.ownership = text
        text = secure_replace(self.ownership.text, "", text)
        self.charges = text

    @staticmethod
    def _merge(current, value, parser_cls, label):
        """Combine *current* with *value* for the section named *label*.

        *value* may be a *parser_cls* instance (merged directly) or a string
        (parsed with ``parser_cls.match`` first; a ``None`` match leaves
        *current* unchanged through ``Parser.__add__``).

        Raises:
            TypeError: if *value* is neither a string nor a *parser_cls*.
        """
        if isinstance(value, parser_cls):
            return current + value
        if isinstance(value, str):
            return current + parser_cls.match(value)
        raise TypeError(
            f"{label} property accepts string type values. "
            f"{type(value).__name__} was found."
        )

    @property
    def cover(self):
        """The cover section parser."""
        return self.__cover

    @cover.setter
    def cover(self, cover):
        if cover:
            self.__cover = self._merge(self.cover, cover, CoverParser, "Cover")

    @property
    def description(self):
        """The description section parser."""
        return self.__description

    @description.setter
    def description(self, description):
        if description:
            self.__description = self._merge(
                self.description, description, DescriptionParser, "Description"
            )

    @property
    def ownership(self):
        """The ownership section parser (a fresh empty one if none is set)."""
        return self.__ownership or OwnershipParser()

    @ownership.setter
    def ownership(self, ownership):
        if ownership:
            self.__ownership = self._merge(
                self.ownership, ownership, OwnershipParser, "Ownership"
            )

    @property
    def charges(self):
        """The charges section parser (a fresh empty one if none is set)."""
        return self.__charges or ChargesParser()

    @charges.setter
    def charges(self, charges):
        if charges:
            self.__charges = self._merge(
                self.charges, charges, ChargesParser, "Charges"
            )

    @property
    def text(self):
        """Human-readable dump of the description, ownership and charges."""
        return (
            "Description:\r"
            + f"{self.description}\n"
            + "Ownership:\r"
            + f"{self.ownership}\n"
            + "Charges:\r"
            + f"{self.charges}\n"
        )

    @property
    def success(self):
        """Truthy (a text length) when any of description, charges or
        ownership is non-empty; ``0`` otherwise."""
        return len(self.description) or len(self.charges) or len(self.ownership)

    def __str__(self):
        """Return the same dump as the ``text`` property."""
        return self.text
|
|
|
|
|
def secure_replace(pattern, replace, string):
    """Best-effort ``re.sub``.

    Replaces every match of the regex *pattern* in *string* with *replace*.
    If the substitution raises for any reason (for instance an invalid
    pattern), *string* is returned unchanged instead.
    """
    try:
        result = re.sub(pattern, replace, string)
    except Exception:
        # Deliberately forgiving: any failure falls back to the input.
        result = string
    return result
|
|
|
|
|
if __name__ == "__main__":
    import os

    # Manual smoke run over the entries of ../pdfs/T1 (relative to this file).
    here = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(here, "../pdfs/T1"))

    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        # NOTE(review): the path string itself is handed to TextParser as the
        # text to parse; the file is never opened here -- confirm intent.
        parser = TextParser(file_path)
        description = parser.description
        print(file_name.upper())
        print(description)
        print(description.data)
        print()
|
|
|