# BUILT-INS
import re

# THIRD-PARTY
from unidecode import unidecode


def _blank_euro_runs(text):
    """Replace every run of "€" in *text* with a single space.

    This has to run on the raw text: ``Parser.search`` transliterates its
    input to ASCII, so no "€" is left in anything it returns.
    """
    return re.sub(r"€+", " ", text)


class Parser:
    """Base wrapper around a fragment of text.

    Subclasses provide a ``match(text)`` static method that extracts their
    section from a larger text and returns a parser instance, or ``None``
    when the section is not found. ``__add__``/``__radd__`` rely on that
    method when one operand is a plain string; the base class itself does
    not define it.
    """

    __text = None

    def __init__(self, text=""):
        self.text = text

    def __str__(self):
        return self.text

    def __add__(self, other):
        """Return the combination of this parser with *other* (on the right).

        * falsy *other* (``None``, ``""``, an empty parser): returns ``self``;
        * parser of exactly the same class: a new parser whose text is both
          texts joined by a newline;
        * ``str``: the combined text is re-parsed with the class' ``match``;
          when nothing matches, ``self`` is returned;
        * anything else: returns ``self``.
        """
        if not other:
            return self
        if type(other) is type(self):
            # NOTE(review): when self.text is empty the result starts with
            # "\n"; kept as-is because callers may depend on it -- confirm.
            return self.__class__(self.text + "\n" + other.text)
        if isinstance(other, str):
            merged = self.__class__.match(self.text + "\n" + other)
            # ``match`` returns None when nothing is found; fall back to
            # ``self`` so that ``+`` never evaluates to None.
            return self if merged is None else merged
        return self

    def __radd__(self, other):
        """Mirror of ``__add__`` with *other* on the left-hand side."""
        if not other:
            return self
        if type(other) is type(self):
            return self.__class__(other.text + "\n" + self.text)
        if isinstance(other, str):
            merged = self.__class__.match(other + "\n" + self.text)
            return self if merged is None else merged
        return self

    def __len__(self):
        return len(self.text)

    @staticmethod
    def clean_text(fn):
        """Placeholder; currently does nothing and returns ``None``."""
        pass

    @staticmethod
    def search(pattern, string):
        """Case-insensitive ``re.search`` over the ASCII form of *string*.

        *string* is transliterated with ``unidecode`` before searching, so
        *pattern* must be written without accents or other non-ASCII
        characters to be able to match.
        """
        return re.search(pattern, unidecode(string), flags=re.IGNORECASE)

    @staticmethod
    def sub(pattern, string, replace):
        """Case-insensitive ``re.sub`` wrapper.

        NOTE: the arguments are forwarded positionally to ``re.sub``, so
        despite their names *string* acts as the replacement and *replace*
        as the text being searched. Every caller in this module uses
        ``Parser.sub(pattern, replacement, text)``. Only the replacement is
        passed through ``unidecode``.
        """
        return re.sub(
            pattern, unidecode(string), replace, count=0, flags=re.IGNORECASE
        )

    @property
    def text(self):
        """The accumulated text, or ``""`` when nothing has been stored."""
        return self.__text or ""

    @text.setter
    def text(self, text):
        # Assignment appends: a non-empty string is added on a new line after
        # any text already stored. Empty or non-string values are ignored.
        if text and isinstance(text, str):
            if self.__text:
                self.__text += "\n" + text
            else:
                self.__text = text

    @property
    def data(self):
        """Parsed content of the section; the raw text for the base class."""
        return self.text


class CoverParser(Parser):
    """Text that precedes the "DESCRIPCION" marker."""

    @staticmethod
    def match(text):
        """Return a ``CoverParser`` for *text*, or ``None`` if nothing matches."""
        match = Parser.search(r"((?!DESCRIPCION).)+", text)
        if match:
            cover_content = Parser.sub(r"DESCRIPCION.*", "", match.group())
            return CoverParser(cover_content)
        return None


class DescriptionParser(Parser):
    """Section introduced by "DESCRIPCION:"."""

    @staticmethod
    def match(text):
        """Return a ``DescriptionParser`` for *text*, or ``None``.

        The section runs from "DESCRIPCION:" up to (not including) the first
        of TITULO / TITULARES / TITULARIDADES.
        """
        match = Parser.search(
            r"DESCRIPCION *\: *((?!(TITULO|TITULARES|TITULARIDADES)).)+",
            _blank_euro_runs(text),
        )
        if match:
            description_body = Parser.sub(r"^DESCRIPCION\:? *", "", match.group())
            return DescriptionParser(description_body)
        return None

    @property
    def data(self):
        """Dictionary with every field extracted from the description."""
        return {
            "qualification": self.qualification,
            "type": self.type,
            "town": self.town,
            "street": self.street,
            "number": self.number,
            "surface": self.surface,
        }

    @property
    # @clean_match
    def qualification(self):
        """"RUSTICA"/"URBANA" as written in the text, else inferred, else None.

        When the text states neither, "URBANA" is returned for any detected
        type whose name does not contain "terreno".
        """
        match = Parser.search(r"(RUSTICA|URBANA)", self.text)
        if match:
            return match.group()

        kind = self.type
        if kind and "terreno" not in kind.lower():
            return "URBANA"
        return None

    @property
    # @clean_match
    def type(self):
        """First of PORCION DE TERRENO / VIVIENDA / CASA found, or None."""
        match = Parser.search(r"(PORCION DE TERRENO|VIVIENDA|CASA)", self.text)
        if match:
            return match.group()
        return None

    @property
    # @clean_match
    def town(self):
        """Location following "sita/sito/situado/situada en", or None.

        Falls back to the text following "PORCION DE TERRENO en" or
        "VIVIENDA en". Anything from "calle" onwards is dropped.
        """
        match = Parser.search(
            r"(?:sita|sito|situado|situada) en (((?!(?:\.|\,|\;)).)+)", self.text
        )
        if match:
            return Parser.sub(r" *calle.*", "", match.groups()[0])

        match = Parser.search(
            r"(?:PORCION DE TERRENO|VIVIENDA) en (((?!(?:\.|\,|\;)).)+)", self.text
        )
        if match:
            return Parser.sub(r" *calle.*", "", match.groups()[0])
        return None

    @property
    # @clean_match
    def street(self):
        """Street type and name; None for rustic properties or when absent."""
        qualification = self.qualification
        if qualification and qualification.lower() == "rustica":
            return None

        match = Parser.search(r"(calle|plaza|camino) *(((?!,).)+)", self.text)
        if match:
            groups = match.groups()
            return groups[0] + " " + Parser.sub(r" *numero.*", "", groups[1])
        return None

    @property
    # @clean_match
    def number(self):
        """Street number; None for rustic properties or when absent."""
        qualification = self.qualification
        if qualification and qualification.lower() == "rustica":
            return None

        # The pattern is ASCII ("numero") because Parser.search transliterates
        # the text; the accented spelling could never match.
        match = Parser.search(r"(?:numero) * (((?!(?:\.|\,|\;)).)+)", self.text)
        if match:
            return match.groups()[0]
        return None

    @property
    # @clean_match
    def surface(self):
        """Text following "superficie [total] [construida|edificada] [de]"."""
        match = Parser.search(
            r"superficie *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)",
            self.text,
        )
        if match:
            return match.groups()[0]
        return None


class OwnershipParser(Parser):
    """Section introduced by "TITULO:"."""

    @staticmethod
    def match(text):
        """Return an ``OwnershipParser`` for *text*, or ``None``.

        The section runs from "TITULO:" up to (not including) "CARGAS".
        """
        match = Parser.search(r"TITULO *\: *((?!CARGAS).)+", _blank_euro_runs(text))
        if match:
            ownership_body = Parser.sub(r"^TITULO *\: *", "", match.group())
            return OwnershipParser(ownership_body)
        return None

    @property
    def data(self):
        """Dictionary with every field extracted from the ownership section."""
        return {
            "owner": self.owner,
            "nie": self.nie,
            "participation": self.participation,
            "adjudication": self.adjudication,
            "notary": self.notary,
            "town": self.town,
            "date": self.date,
        }

    # The patterns below are written without accents on purpose:
    # Parser.search transliterates the text to ASCII before matching.

    @property
    # @clean_match
    def owner(self):
        """Text following "Titular/es:" up to the first comma, or None."""
        match = Parser.search(r"Titular\/es\:? *(((?!,).)+)", self.text)
        if match:
            return match.groups()[0].strip()
        return None

    @property
    # @clean_match
    def nie(self):
        """Identifier (8 digits, optional leading/trailing letter) after DNI/CIF."""
        match = Parser.search(
            r"(DNI|CIF).* ([a-zA-Z]?[0-9]{8}[a-zA-Z]?) ", self.text
        )
        if match:
            return match.groups()[1].strip()
        return None

    @property
    # @clean_match
    def participation(self):
        """Text following "Participacion:" up to "Titulo", or None."""
        match = Parser.search(r"Participacion *\: *(((?!Titulo).)+)", self.text)
        if match:
            return match.groups()[0].strip()
        return None

    @property
    # @clean_match
    def adjudication(self):
        """Text following "Titulo:" up to "Notario", or None."""
        match = Parser.search(r"Titulo *: *(((?!Notario).)+)", self.text)
        if match:
            return match.groups()[0].strip()
        return None

    @property
    # @clean_match
    def notary(self):
        """Text following "Notario[/Autoridad]:" up to "Poblacion", or None."""
        match = Parser.search(
            r"Notario(?:\/Autoridad)? *: *(((?!Poblacion).)+) ", self.text
        )
        if match:
            return match.groups()[0].strip()
        return None

    @property
    # @clean_match
    def town(self):
        """Text following "Poblacion:" up to "Fecha", or None."""
        match = Parser.search(r"Poblacion *: *(((?!Fecha).)+)", self.text)
        if match:
            return match.groups()[0].strip()
        return None

    @property
    # @clean_match
    def date(self):
        """Date (dd/dd/dddd) following "Fecha documento:"/"Fecha escritura:"."""
        match = Parser.search(
            r"Fecha (?:documento|escritura) *: *((?!Protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})",
            self.text,
        )
        if match:
            return match.groups()[0].strip()
        return None


class ChargesParser(Parser):
    """Section introduced by "CARGAS:"."""

    @staticmethod
    def match(text):
        """Return a ``ChargesParser`` for *text*, or ``None``.

        The section runs from "CARGAS:" up to (not including) "PRESENTACION".
        """
        match = Parser.search(
            r"CARGAS *\: *((?!PRESENTACION).)+", _blank_euro_runs(text)
        )
        if match:
            charges_body = Parser.sub(r"^CARGAS\: *", "", match.group())
            return ChargesParser(charges_body)
        return None


# def get_presentation (text):
#     match = re.search(r"(PRESENTACION\:)?((?!-+).)+", text)
#     if match:
#         return re.sub("^PRESENTACION\: *", "", re.sub(r"€+", " ", match.group()))


class TextParser:
    """Split a text into cover, description, ownership and charges sections.

    Each section is exposed as a property holding the matching parser.
    Assigning a string to a section property parses it and appends the result
    to the section; assigning a parser of the section's type appends it
    directly; falsy values are ignored; any other type raises ``TypeError``.
    """

    __cover = CoverParser()
    __description = DescriptionParser()
    __ownership = OwnershipParser()
    __charges = ChargesParser()

    def __init__(self, text):
        # NOTE(review): the section texts below are handed to secure_replace
        # as regular-expression patterns, not as literal text -- confirm that
        # this is intended.
        self.cover = text
        text = secure_replace(self.cover.text, "", text)
        self.description = text
        # NOTE(review): the cover text is stripped a second time here; the
        # description text was presumably intended -- confirm before changing.
        text = secure_replace(self.cover.text, "", text)
        self.ownership = text
        text = secure_replace(self.ownership.text, "", text)
        self.charges = text

    @property
    def cover(self):
        return self.__cover

    @cover.setter
    def cover(self, cover):
        if not cover:
            return
        if isinstance(cover, CoverParser):
            self.__cover = self.cover + cover
        elif isinstance(cover, str):
            self.__cover = self.cover + CoverParser.match(cover)
        else:
            raise TypeError(
                "Cover property accepts string type values. "
                + type(cover).__name__
                + " was found."
            )

    @property
    def description(self):
        return self.__description

    @description.setter
    def description(self, description):
        if not description:
            return
        if isinstance(description, DescriptionParser):
            self.__description = self.description + description
        elif isinstance(description, str):
            self.__description = self.description + DescriptionParser.match(
                description
            )
        else:
            raise TypeError(
                "Description property accepts string type values. "
                + type(description).__name__
                + " was found."
            )

    @property
    def ownership(self):
        return self.__ownership or OwnershipParser()

    @ownership.setter
    def ownership(self, ownership):
        if not ownership:
            return
        if isinstance(ownership, OwnershipParser):
            self.__ownership = self.ownership + ownership
        elif isinstance(ownership, str):
            self.__ownership = self.ownership + OwnershipParser.match(ownership)
        else:
            raise TypeError(
                "Ownership property accepts string type value. "
                + type(ownership).__name__
                + " was found."
            )

    @property
    def charges(self):
        return self.__charges or ChargesParser()

    @charges.setter
    def charges(self, charges):
        if not charges:
            return
        if isinstance(charges, ChargesParser):
            self.__charges = self.charges + charges
        elif isinstance(charges, str):
            self.__charges = self.charges + ChargesParser.match(charges)
        else:
            raise TypeError(
                "Charges property accepts string type value. "
                + type(charges).__name__
                + " was found."
            )

    @property
    def text(self):
        """Description, ownership and charges rendered under their headings."""
        return (
            "Description:\r"
            + f"{self.description}\n"
            + "Ownership:\r"
            + f"{self.ownership}\n"
            + "Charges:\r"
            + f"{self.charges}\n"
        )

    @property
    def success(self):
        """Truthy when description, charges or ownership holds any text."""
        return len(self.description) or len(self.charges) or len(self.ownership)

    def __str__(self):
        return self.text


def secure_replace(pattern, replace, string):
    """Best-effort ``re.sub(pattern, replace, string)``.

    Returns *string* unchanged when the substitution raises, for instance
    because *pattern* is not a valid regular expression.
    """
    try:
        return re.sub(pattern, replace, string)
    except Exception:
        # Deliberately broad: this helper must never abort the parsing.
        return string


if __name__ == "__main__":
    import os

    directory = os.path.relpath(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), "../pdfs/T1")
    )
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        # NOTE(review): the path itself (not the file's contents) is what
        # gets parsed here -- confirm whether the file should be read first.
        parser = TextParser(file_path)
        print(file_name.upper())
        print(parser.description)
        print(parser.description.data)
        print()