Browse Source

Reestructuració de la part del server

develop
orzo 7 months ago
parent
commit
e3eb291623
  1. 2
      .gitignore
  2. 2
      client/index.js
  3. 3
      package-lock.json
  4. 1
      parsers/__init__.py
  5. 200
      parsers/geocoding.py
  6. 2
      requirements.txt
  7. 1
      server/__init__.py
  8. 46
      server/log/log.txt
  9. 7
      server/requirements.txt
  10. 13
      server/src/api/main.py
  11. 96
      server/src/parsers/geocoding.py
  12. 41
      server/src/parsers/image.py
  13. 169
      server/src/parsers/numeric.py
  14. 6
      server/src/parsers/pdf.py
  15. 182
      server/src/parsers/text.py
  16. 10
      server/static/index.html
  17. 8
      server/test.py
  18. 20
      server/wsgi.py

2
.gitignore vendored

@ -3,3 +3,5 @@
pdfs
images
node_modules
__pycache__
log/*

2
client/index.js

@ -29,8 +29,6 @@ app.post('/parceles', function(req, res){
ST_GeomFromText('POLYGON((${req.body.east} ${req.body.north}, ${req.body.east} ${req.body.south},${req.body.west} ${req.body.south}, ${req.body.west} ${req.body.north}, ${req.body.east} ${req.body.north}))', 4326)
);`;
console.log(query);
db.spatialite(function(err){
const features = [];
db.each(query, function (err, row) {

3
package-lock.json generated

@ -1,3 +0,0 @@
{
"lockfileVersion": 1
}

1
parsers/__init__.py

@ -1 +0,0 @@
from .pdf import PdfParser

200
parsers/geocoding.py

@ -1,200 +0,0 @@
import geocoder
import re
import json
# Spanish / Catalan number words, keyed by their accent-free lowercase spelling.
_UNITS = {
    "uno": 1, "un": 1,
    "dos": 2,
    "tres": 3,
    "cuatro": 4, "quatre": 4,
    "cinco": 5, "cinc": 5,
    "seis": 6, "sis": 6,
    "siete": 7, "set": 7,
    "ocho": 8, "vuit": 8,
    "nueve": 9, "nou": 9,
}
_TEENS = {
    "once": 11, "onze": 11,
    "doce": 12, "dotze": 12,
    "trece": 13, "tretze": 13,
    "catorce": 14, "catorze": 14,
    "quince": 15, "quinze": 15,
    # The "diezi..." spellings are kept for compatibility with existing data.
    "dieciseis": 16, "dieziseis": 16, "setze": 16,
    "diecisiete": 17, "diezisiete": 17, "disset": 17, "diset": 17,
    "dieciocho": 18, "dieziocho": 18, "divuit": 18,
    "diecinueve": 19, "diezinueve": 19, "dinou": 19,
}
_TENS = {
    "diez": 10, "deu": 10,
    "veinte": 20, "veint": 20, "vint": 20,
    "treinta": 30, "trenta": 30,
    "cuarenta": 40, "quaranta": 40,
    "cincuenta": 50, "cinquanta": 50,
    "sesenta": 60, "seixanta": 60,
    "setenta": 70, "setanta": 70,
    "ochenta": 80, "vuitanta": 80,
    "noventa": 90, "noranta": 90,
}
# Spanish hundreds whose stem is not a plain unit word.
_IRREGULAR_HUNDREDS = {
    "quinientos": 500, "quinientas": 500,
    "setecientos": 700, "setecientas": 700,
    "novecientos": 900, "novecientas": 900,
}
# "<unit?>cien(to|tos|ta|tas)" or "<unit?>cent(s)" written as one word.
_HUNDRED_WORD = re.compile(r"^([a-z]*?)(?:cien|cent)(?:t[oa]s?|s)?$")


def _simple_value (word):
    """Return the value of a units/teens/tens word, or None if it is not one.

    Fused compounds such as "veintidos" or "treintayuno" are split into
    their tens stem and unit.
    """
    for table in (_UNITS, _TEENS, _TENS):
        if word in table:
            return table[word]
    for stem, tens in _TENS.items():
        # "diez"/"deu" never take a fused unit, so only 20..90 are stems.
        if tens >= 20 and word.startswith(stem):
            rest = word[len(stem):]
            if rest[:1] in ("i", "y"):
                rest = rest[1:]
            if rest in _UNITS:
                return tens + _UNITS[rest]
    return None


def parse_number (chars):
    """Convert digits or a Spanish/Catalan number phrase into an int.

    Args:
        chars: text such as "12", "uno", "noventa y seis" or "vint-i-dos".

    Returns:
        "" when ``chars`` is empty or None (so callers can drop the number),
        otherwise the parsed integer; 0 when no number word is recognised.
    """
    if not chars:
        return ""
    chars = chars.lower()
    try:
        return int(chars)
    except ValueError:
        pass

    total = 0   # value of the completed "... mil" groups
    group = 0   # value accumulated since the last "mil"
    for word in re.findall(r"[a-z]+", chars):
        if word == "mil":
            # A bare "mil" means one thousand.
            total += (group or 1) * 1000
            group = 0
            continue
        if word in _IRREGULAR_HUNDREDS:
            group += _IRREGULAR_HUNDREDS[word]
            continue
        value = _simple_value(word)
        if value is not None:
            group += value
            continue
        hundred = _HUNDRED_WORD.match(word)
        if hundred:
            prefix = hundred.group(1)
            if prefix in _UNITS:
                group += _UNITS[prefix] * 100
            elif not prefix:
                # "dos cents" multiplies the preceding unit; "ciento dos" adds.
                group = group * 100 if 0 < group < 10 else group + 100
        # Any other word (connectors like "y"/"i", noise) is ignored.
    return total + group
def _split_words (chars):
    """Lower-case ``chars`` and split run-together capitalised words.

    "calle SanAntonio" becomes "calle san antonio"; empty chunks produced
    by repeated blanks are dropped.
    """
    return " ".join(
        chunk.lower()
        for word in chars.split(" ")
        for chunk in re.split(r"([A-Z][a-z]+)", word)
        if chunk
    )


def parse_street (chars):
    """Return the street name normalised for geocoding ("" when missing)."""
    if not chars:
        return ""
    return _split_words(chars)


def parse_town (chars):
    """Return the town name normalised for geocoding ("" when missing).

    The standalone word "villa"/"vila" and an optional following "de" are
    removed. Word boundaries keep names that merely start with those
    letters (e.g. "Vilafranca") intact, and keep "del" from losing its "de".
    """
    if not chars:
        return ""
    town = _split_words(chars)
    town = re.sub(r"\b(?:villa|vila)\b(?: *de\b *)?", "", town)
    # Removing a word can leave doubled or edge blanks behind.
    return " ".join(town.split())
def build_address (record):
    """Build the free-text address sent to the geocoder.

    Reads "number", "street" and "town" from ``record.description.data``
    and joins the pieces that are present as
    "<number> <street>, <town>, cataluña, españa". Missing pieces are left
    out entirely, so the result never starts with a blank or contains an
    empty ", ," slot.
    """
    data = record.description.data
    number = str(parse_number(data.get("number")))
    street = parse_street(data.get("street"))
    town = parse_town(data.get("town"))
    # House number and street share the first comma-separated slot.
    first = " ".join(part for part in (number, street) if part)
    return ", ".join(part for part in (first, town, "cataluña", "españa") if part)
class GeoLocation (object):
    """Geocoding result for a parsed record, attached to its parser.

    Building an instance geocodes ``parser.data`` and stores the instance
    on the parser as ``parser.geolocation``. Every accessor returns None
    when the geocoder response lacks that field.
    """

    # Order of the keys in ``data`` (and therefore in the JSON output).
    _FIELDS = ("latlng", "street", "postcode", "town",
               "county", "region", "country", "address")

    def __init__ (self, parser):
        response = self.geolocate(parser.data)
        self._data = response.json or {}
        parser.geolocation = self

    def geolocate (self, record):
        """Query the OSM geocoder with the address built from ``record``."""
        return geocoder.osm(build_address(record))

    @property
    def latlng (self):
        """[lat, lng] pair taken from the response."""
        return [self._data.get(key) for key in ("lat", "lng")]

    @property
    def address (self):
        return self._data.get("address")

    @property
    def street (self):
        return self._data.get("street")

    @property
    def postcode (self):
        return self._data.get("postcode")

    @property
    def town (self):
        return self._data.get("town")

    @property
    def county (self):
        return self._data.get("county")

    @property
    def region (self):
        return self._data.get("region")

    @property
    def country (self):
        return self._data.get("country")

    @property
    def data (self):
        """All exposed fields as a plain dict."""
        return {name: getattr(self, name) for name in self._FIELDS}

    def __str__ (self):
        return json.dumps(self.data)

2
requirements.txt

@ -1,2 +0,0 @@
pytesseract==0.3.7
pdf2image==1.14.0

1
server/__init__.py

@ -0,0 +1 @@
from .main import app

46
server/log/log.txt

@ -0,0 +1,46 @@
# FILENAME
F74ZN39.PDF
Format vectorial
## PLAIN_TEXT
Description:
URBANA: DEPARTAMENTO OCHO.- VIVIENDA en la planta piso segundo, puertasegunda de la casa en construccion, sita en la villa de Breda y calle SanAntonio, numero uno. Tiene una superficie construida de noventa y seis metrosveinte decimetros cuadrados, destinados a vivienda, compuesta de distribuidor,sala, cocina, dos baños, despensa y tres habitaciones. Tiene como anejo de usoexclusivo y excluyente del departamento, un balcon de superficie construidasiete metros cincuenta decimetros cuadrados. Tiene la vivienda una superficieutil de ochenta y cinco metros cincuenta y siete decimetros cuadrados. LINDA:por su frente, tomando como tal su puerta de entrada, con vestibulo y rellano deescalera; por la derecha entrando, con departamento numero nueve de esta mismaplanta segunda; por la izquierda, con departamento numero siete de esta mismaplanta segunda; y por el fondo, con vuelo de la calle Sant Pere. Coeficiente departicipacion en la propiedad horizontal: Ocho enteros cincuenta y sietecentesimas por ciento.
Ownership:
Titular/es:JAVIER MORENO CHAPARRO, con DNI numero 28416614EParticipacion:la total finca Titulo: ADJUDICACIONNotario/Autoridad: Jose Maria Chiner VivesPoblacion: ARBUCIESFecha documento: 14/01/2004Protocolo: 49C.S.V.: 2170189976B8A692Pag: 2 of
Charges:
1) Nota fiscal, cuatro años, Impuesto de TP/AJD, fecha 20/01/2003, almargen de la inscripcion 9ª. 2) Afecta a las limitaciones derivadas del regimen de propiedadhorizontal, en que se halla constituida la total finca, segun escrituraautorizada por el notario de SANT CELONI, ENRIQUE PEREZ MENCIO, el dieciocho dejunio de dos mil dos, que motivo la inscripcion 9ª de la finca matriz numero220, al folio 89 del tomo 2282, libro 54 de BREDA. 3) Nota fiscal, cuatro años, Impuesto de TP/AJD, fecha 18/03/2004, almargen de la inscripcion 2ª.
## STRUCTURED
description: {
"qualification": "URBANA",
"type": "VIVIENDA",
"town": "la villa de Breda y",
"street": "calle SanAntonio",
"number": "uno",
"surface": "noventa y seis (metro)"
}
ownership: {
"owner": "JAVIER MORENO CHAPARRO",
"nie": "21701899",
"participation": "la total finca",
"adjudication": "ADJUDICACION",
"notary": "Jose Maria Chiner",
"town": "ARBUCIES",
"date": "14/01/2004"
}
## GEOLOCATION
{
"latlng": [
null,
null
],
"street": null,
"postcode": null,
"town": null,
"county": null,
"region": null,
"country": null,
"address": null
}

7
server/requirements.txt

@ -0,0 +1,7 @@
pytesseract==0.3.7
pdf2image==1.14.0
pypdf2
pyspellchecker
numpy
opencv-python
geocoder

13
server/src/api/main.py

@ -0,0 +1,13 @@
from fastapi import FastAPI, UploadFile, File
app = FastAPI()


@app.get("/")
async def index():
    """Return the welcome message shown at the API root."""
    return {"message": "Benvinguda al portal de les desregistradores"}


@app.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Accept one uploaded file and echo back its file name."""
    return {"filename": file.filename}

96
server/src/parsers/geocoding.py

@ -0,0 +1,96 @@
# VENDOR
import geocoder
# BUILT-INS
import re
import json
# SOURCE
from server.src.parsers.numeric import Numeric
def _split_words (chars):
    """Lower-case ``chars`` and split run-together capitalised words.

    "calle SanAntonio" becomes "calle san antonio"; empty chunks produced
    by repeated blanks are dropped.
    """
    return " ".join(
        chunk.lower()
        for word in chars.split(" ")
        for chunk in re.split(r"([A-Z][a-z]+)", word)
        if chunk
    )


def parse_street (chars):
    """Return the street name normalised for geocoding ("" when missing)."""
    if not chars:
        return ""
    return _split_words(chars)


def parse_town (chars):
    """Return the town name normalised for geocoding ("" when missing).

    The standalone word "villa"/"vila" and an optional following "de" are
    removed. Word boundaries keep names that merely start with those
    letters (e.g. "Vilafranca") intact, and keep "del" from losing its "de".
    """
    if not chars:
        return ""
    town = _split_words(chars)
    town = re.sub(r"\b(?:villa|vila)\b(?: *de\b *)?", "", town)
    # Removing a word can leave doubled or edge blanks behind.
    return " ".join(town.split())
def build_address (record):
    """Build the free-text address sent to the geocoder.

    Reads "number", "street" and "town" from ``record.description.data``
    and joins the pieces that are present as
    "<number> <street>, <town>, cataluña, españa". Missing pieces are left
    out entirely, so the result never starts with a blank or contains an
    empty ", ," slot.
    """
    data = record.description.data
    number = str(Numeric(data.get("number")))
    street = parse_street(data.get("street"))
    town = parse_town(data.get("town"))
    # House number and street share the first comma-separated slot.
    first = " ".join(part for part in (number, street) if part)
    return ", ".join(part for part in (first, town, "cataluña", "españa") if part)
class GeoLocation (object):
    """Geocoding result for a parsed record, attached to its parser.

    Building an instance geocodes ``parser.data`` and stores the instance
    on the parser as ``parser.geolocation``. Every accessor returns None
    when the geocoder response lacks that field.
    """

    # Order of the keys in ``data`` (and therefore in the JSON output).
    _FIELDS = ("latlng", "street", "postcode", "town",
               "county", "region", "country", "address")

    def __init__ (self, parser):
        response = self.geolocate(parser.data)
        self._data = response.json or {}
        parser.geolocation = self

    def geolocate (self, record):
        """Query the OSM geocoder with the address built from ``record``."""
        return geocoder.osm(build_address(record))

    @property
    def latlng (self):
        """[lat, lng] pair taken from the response."""
        return [self._data.get(key) for key in ("lat", "lng")]

    @property
    def address (self):
        return self._data.get("address")

    @property
    def street (self):
        return self._data.get("street")

    @property
    def postcode (self):
        return self._data.get("postcode")

    @property
    def town (self):
        return self._data.get("town")

    @property
    def county (self):
        return self._data.get("county")

    @property
    def region (self):
        return self._data.get("region")

    @property
    def country (self):
        return self._data.get("country")

    @property
    def data (self):
        """All exposed fields as a plain dict."""
        return {name: getattr(self, name) for name in self._FIELDS}

    def __str__ (self):
        return json.dumps(self.data)

41
parsers/image.py → server/src/parsers/image.py

@ -1,17 +1,26 @@
# BUILT-INS
import os
import re
from PIL import Image
# VENDOR
import cv2
import numpy as np
import pytesseract
import pdf2image
from matplotlib import pyplot as plt
def increase_contrast (img):
lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
l,a,b = cv2.split(lab)
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
cl = clahe.apply(l)
limg = cv2.merge((cl, a, b))
return cv2.cvtColor(limg, cv2.COLOR_LAB2BGR)
def get_grayscale (img):
return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
return cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), cv2.COLOR_LAB2BGR)
def binarize (img):
@ -23,7 +32,7 @@ def remove_noise (img):
def thresholding (img):
return cv2.threshold(img, .0, 255., cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
return cv2.threshold(img, .0, 255., cv2.THRESH_BINARY+cv2.THRESH_OTSU)[1]
def dilate (img):
@ -70,8 +79,10 @@ def get_rotation (img):
def pdf_to_images (file_path):
directory = os.path.relpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), "../images"))
subdirectory = os.path.join(directory, os.path.basename(os.path.splitext(file_path)[0]))
doc_name = os.path.basename(os.path.splitext(file_path)[0])
directory = re.sub(r"\/pdfs.*$", "/images", file_path)
# directory = os.path.relpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), "../images"))
subdirectory = os.path.join(directory, doc_name)
if os.path.isdir(subdirectory):
for file_name in os.listdir(subdirectory):
os.remove(os.path.join(subdirectory, file_name))
@ -113,13 +124,15 @@ class ImageParser (object):
def preprocess (self):
preprocessed = []
for img in self.images:
img = increase_contrast(img)
img = get_grayscale(img)
img = remove_noise(img)
img = binarize(img)
img = opening(img)
# img = remove_noise(img)
# img = thresholding(img)
# img = binarize(img)
# img = opening(img)
# img = erode(img)
# img = dilate(img)
# self.show_image("Test", img)
self.show_image("Test", img)
preprocessed.append(img)
@ -142,15 +155,17 @@ class ImageParser (object):
# cv2.imshow("Image", get_grayscale(cv2img))
def show_image (self, name, img):
cv2.imshow(name, img)
cv2.waitKey(0)
cv2.destroyAllWindows()
plt.subplot(121), plt.imshow(img), plt.title(name)
plt.xticks([]), plt.yticks([])
plt.show()
# cv2.imshow(name, img)
# cv2.waitKey(0)
# cv2.destroyAllWindows()
if __name__ == "__main__":
file_path = os.path.join("../pdfs/T1/Verdi 106.pdf")
parser = ImageParser(file_path)
parser.preprocess()
print(parser.text)
i = 1
out_path = os.path.join("../images")

169
server/src/parsers/numeric.py

@ -0,0 +1,169 @@
import re
class Numeric (object):
    """A number given as digits or as Spanish/Catalan number words.

    The raw value is kept as passed in and parsed on every access to
    ``val``. ``str()`` renders the integer part, or "" when no value was
    given, so an instance can be dropped straight into a format string.
    Number words are expected lower- or upper-case without accents.
    """

    # Units 1-9.
    decimals = {
        "uno": 1,
        "un": 1,
        "dos": 2,
        "tres": 3,
        "cuatro": 4,
        "quatre": 4,
        "cinco": 5,
        "cinc": 5,
        "seis": 6,
        "sis": 6,
        "siete": 7,
        "set": 7,
        "ocho": 8,
        "vuit": 8,
        "nueve": 9,
        "nou": 9
    }
    # 11-19. The "diezi..." / "diset" spellings are kept for compatibility.
    teens = {
        "once": 11,
        "onze": 11,
        "doce": 12,
        "dotze": 12,
        "trece": 13,
        "tretze": 13,
        "catorce": 14,
        "catorze": 14,
        "quince": 15,
        "quinze": 15,
        "dieciseis": 16,
        "dieziseis": 16,
        "setze": 16,
        "diecisiete": 17,
        "diezisiete": 17,
        "disset": 17,
        "diset": 17,
        "dieciocho": 18,
        "dieziocho": 18,
        "divuit": 18,
        "diecinueve": 19,
        "diezinueve": 19,
        "dinou": 19
    }
    # Multiples of ten, including the stems used in fused compounds.
    tenths = {
        "diez": 10,
        "deu": 10,
        "veinte": 20,
        "veint": 20,
        "vint": 20,
        "treinta": 30,
        "trenta": 30,
        "cuarenta": 40,
        "quaranta": 40,
        "cincuenta": 50,
        "cinquanta": 50,
        "sesenta": 60,
        "seixanta": 60,
        "setenta": 70,
        "setanta": 70,
        "ochenta": 80,
        "vuitanta": 80,
        "noventa": 90,
        "noranta": 90,
    }
    hundreds = {
        "cien": 100,
        "cent": 100,
    }
    thousands = {
        "mil": 1000
    }
    # Spanish hundreds whose stem is not a plain unit word.
    _irregular_hundreds = {
        "quinientos": 500,
        "quinientas": 500,
        "setecientos": 700,
        "setecientas": 700,
        "novecientos": 900,
        "novecientas": 900,
    }
    # "<unit?>cien(to|tos|ta|tas)" or "<unit?>cent(s)" written as one word.
    _hundred_word = re.compile(
        r"^([a-z]*?)(%s)(?:t[oa]s?|s)?$" % "|".join(hundreds)
    )

    def __init__ (self, value=None):
        # None means "no value"; 0 is a real value and must be kept.
        self.__raw = "" if value is None else value

    def __add__ (self, other):
        """Return a new Numeric holding ``self + other``.

        ``other`` may be a number, a Numeric or a string of digits or
        number words. A falsy ``other`` (0, "", None) returns ``self``.
        """
        if not other:
            return self
        try:
            addend = self.parse(other)
        except TypeError:
            return NotImplemented
        return Numeric(self.val + addend)

    def __radd__ (self, other):
        # Addition is commutative, so the reflected form is the same sum.
        return self.__add__(other)

    def __str__ (self):
        raw = self.__raw
        if isinstance(raw, str) and not raw.strip():
            return ""
        try:
            value = self.val
        except TypeError:
            # Unsupported raw value: render as empty rather than fail.
            return ""
        try:
            return str(int(value))
        except (TypeError, ValueError, OverflowError):
            return str(value)

    @property
    def val (self):
        """Numeric value of the raw input, parsed on each access."""
        return self.parse(self.__raw)

    @staticmethod
    def tokens ():
        """Return every known number word, units first and thousands last."""
        tables = (Numeric.decimals, Numeric.teens, Numeric.tenths,
                  Numeric.hundreds, Numeric.thousands)
        words = []
        for table in tables:
            words.extend(table)
        return words

    def parse (self, val):
        """Convert ``val`` to a number.

        Numbers are returned unchanged, a Numeric yields its ``val``,
        digit-only strings become floats and any other string is read as
        number words.

        Raises:
            TypeError: if ``val`` is none of the supported types.
        """
        if isinstance(val, Numeric):
            return val.val
        if isinstance(val, (int, float)):
            return val
        if isinstance(val, str):
            text = val.strip()
            if text.isdecimal():
                return float(text)
            return self.alnum_to_num(text)
        raise TypeError("Can't parse value")

    def _simple_value (self, word):
        """Return the value of a units/teens/tens word, or None.

        Fused compounds such as "veintidos" or "treintayuno" are split
        into their tens stem and unit.
        """
        for table in (self.decimals, self.teens, self.tenths):
            if word in table:
                return table[word]
        for stem, tens in self.tenths.items():
            # "diez"/"deu" never take a fused unit, so only 20..90 are stems.
            if tens >= 20 and word.startswith(stem):
                rest = word[len(stem):]
                if rest[:1] in ("i", "y"):
                    rest = rest[1:]
                if rest in self.decimals:
                    return tens + self.decimals[rest]
        return None

    def alnum_to_num (self, chars):
        """Sum the number words found in ``chars``.

        Words are read left to right; a thousands word closes the current
        group and multiplies it. Words that are not number words are
        ignored. Returns 0 for empty input or when nothing is recognised.
        """
        if not chars:
            return 0
        total = 0   # value of the completed "... mil" groups
        group = 0   # value accumulated since the last thousands word
        for word in re.findall(r"[a-z]+", chars.lower()):
            if word in self.thousands:
                # A bare "mil" means one thousand.
                total += (group or 1) * self.thousands[word]
                group = 0
                continue
            if word in self._irregular_hundreds:
                group += self._irregular_hundreds[word]
                continue
            value = self._simple_value(word)
            if value is not None:
                group += value
                continue
            hundred = self._hundred_word.match(word)
            if hundred:
                prefix, stem = hundred.group(1), hundred.group(2)
                factor = self.hundreds[stem]
                if prefix in self.decimals:
                    group += self.decimals[prefix] * factor
                elif not prefix:
                    # "dos cents" multiplies the unit; "ciento dos" adds.
                    group = group * factor if 0 < group < 10 else group + factor
        return total + group

6
parsers/pdf.py → server/src/parsers/pdf.py

@ -5,8 +5,8 @@ import os
from PyPDF2 import PdfFileReader
# SOURCE
from .text import TextParser
from .image import ImageParser
from server.src.parsers.text import TextParser
from server.src.parsers.image import ImageParser
class PdfParser (object):
@ -40,7 +40,7 @@ class PdfParser (object):
return TextParser(text)
else:
return TextParser(ImageParser(self.file_path).text)
return TextParser(ImageParser(self.file_path).text, spell=True)
def __str__ (self):
return self.data.text

182
parsers/text.py → server/src/parsers/text.py

@ -1,6 +1,11 @@
# BUILT-INS
import re
from unidecode import unidecode
from spellchecker import SpellChecker
# from unidecode import unidecode
from server.src.parsers.numeric import Numeric
spell = SpellChecker(language="es")
class Parser (object):
@ -34,16 +39,91 @@ class Parser (object):
return len(self.text)
@staticmethod
def clean_text (fn):
pass
def decamelize (text):
    """Insert a blank wherever a lower-case letter is directly followed
    by a capital, e.g. "SanAntonio" becomes "San Antonio".

    Text without such a boundary is returned unchanged.
    """
    # One substitution pass puts each blank at the matched boundary, so
    # the capital is not repeated and earlier identical capitals are left alone.
    return re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
@staticmethod
def spell (text):
    """Spell-correct ``text`` word by word with the module-level checker.

    Words are split on single blanks and re-joined the same way. A word
    is kept unchanged when the checker has no correction for it, since the
    checker may return None in that case.
    """
    return " ".join(spell.correction(word) or word for word in text.split(" "))
@staticmethod
def sanitize (text):
    """Replace accented vowels (grave, acute, diaeresis) with plain ones.

    The case of each letter is preserved ("JOSÉ" becomes "JOSE"); every
    other character, including "ñ", is left untouched.
    """
    accented = "àáäèéëìíïòóöùúü"
    plain = "aaaeeeiiiooouuu"
    table = str.maketrans(accented + accented.upper(), plain + plain.upper())
    return text.translate(table)
@staticmethod
def clean_text (text):
    """Normalise raw document text before it is split into sections.

    Steps, in order: trim the ends and strip accents, drop blanks at the
    start of each line, remove "€" characters, collapse every run of
    blanks to a single one, and remove blanks placed before ",", ".",
    ":" or ";".

    This is best-effort: if any step fails, the error is printed and the
    text is returned as it was received.
    """
    try:
        cleaned = Parser.sanitize(text.strip())
        cleaned = re.sub(r"(?<=\n) +", "", cleaned)
        cleaned = re.sub(r"€+", "", cleaned)
        # Match the whole run so it really shrinks to one blank.
        cleaned = re.sub(r" {2,}", " ", cleaned)
        return re.sub(r" +(?=[,.:;])", "", cleaned)
    except Exception as e:
        print(e)
        print("Error on Parser.clean_text()")
        return text
@staticmethod
def search (pattern, string):
return re.search(pattern, unidecode(string), flags=re.IGNORECASE)
try:
return re.search(pattern, string, flags=re.IGNORECASE)
except Exception as e:
print(e)
print("Error on Parser.search()")
return None
@staticmethod
def sub (pattern, replace, string):
try:
return re.sub(pattern, replace, string, count=0, flags=re.IGNORECASE)
except Exception as e:
print(e)
print("Error on Parser.sub()")
return string
@staticmethod
def sub (pattern, string, replace):
return re.sub(pattern, unidecode(string), replace, count=0, flags=re.IGNORECASE)
def findall (pattern, string):
try:
return re.findall(pattern, string, flags=re.IGNORECASE)
except Exception as e:
print(e)
print("Error on Parser.findall()")
return None
@property
def text (self):
@ -66,9 +146,9 @@ class CoverParser (Parser):
@staticmethod
def match (text):
match = Parser.search(r"((?!DESCRIPCION).)+", text)
match = Parser.search(r"((?!descripcion).)+", text)
if match:
cover_content = Parser.sub(r"DESCRIPCION.*", "", match.group())
cover_content = Parser.sub(r"descripcion.*", "", match.group())
return CoverParser(cover_content)
@ -76,9 +156,9 @@ class DescriptionParser (Parser):
@staticmethod
def match (text):
match = Parser.search(r"DESCRIPCION *\: *((?!(TITULO|TITULARES|TITULARIDADES)).)+", text)
match = Parser.search(r"descripcion *\: *((?!(titulo|titulares|titularidades)).)+", text)
if match:
description_body = Parser.sub(r"^DESCRIPCION\:? *", "", Parser.sub(r"€+", " ", match.group()))
description_body = Parser.sub(r"^descripcion\:? *", "", match.group())
return DescriptionParser(description_body)
@property
@ -93,70 +173,76 @@ class DescriptionParser (Parser):
return data
@property
# @clean_match
def qualification (self):
match = Parser.search(r"(RUSTICA|URBANA)", self.text)
match = Parser.search(r"(rustica|urbana)", self.text)
if match:
return match.group()
type = self.type
type = type and type.lower() or type
if type and "terreno" not in type:
return "URBANA"
return "urbana"
@property
# @clean_match
def type (self):
match = Parser.search(r"(PORCION DE TERRENO|VIVIENDA|CASA)", self.text)
match = Parser.search(r"(porcion de terreno|vivienda|casa)", self.text)
if match:
return match.group()
@property
# @clean_match
def town (self):
match = Parser.search(r"(?:sita|sito|situado|situada) en (((?!(?:\.|\,|\;)).)+)", self.text)
if match:
return Parser.sub(r" *calle.*", "", match.groups()[0])
match = Parser.search(r"(?:PORCION DE TERRENO|VIVIENDA) en (((?!(?:\.|\,|\;)).)+)", self.text)
match = Parser.search(r"(?:porcion de terreno|vivienda) en (((?!(?:\.|\,|\;)).)+)", self.text)
if match:
return Parser.sub(r" *calle.*", "", match.groups()[0])
@property
# @clean_match
def street (self):
if self.qualification and self.qualification.lower() == "rustica":
return None
match = Parser.search(r"(calle|plaza|camino) *(((?!,).)+)", self.text)
if match:
return match.groups()[0] + " " + Parser.sub(r" *numero.*", "", match.groups()[1])
return Parser.decamelize(match.groups()[0]) + " " + Parser.sub(r" *numero.*", "", match.groups()[1])
@property
# @clean_match
def number (self):
if self.qualification and self.qualification.lower() == "rustica":
return None
match = Parser.search(r"(?:número) * (((?!(?:\.|\,|\;)).)+)", self.text)
match = Parser.search(r"(?:numero) * (((?!(?:\.|\,|\;)).)+)", self.text)
if match:
return match.groups()[0]
@property
# @clean_match
def surface (self):
match = Parser.search(r"superficie *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)", self.text)
# match = Parser.search(r"(?:(?!superficie.*)superficie) *(?:total)? *(?:construida|edificada)? * (?:de)? *(((?!(?:\.|\,|\;)).)+)", self.text)
match = Parser.search(r"(?<=superficie) ?(?:(?!(metro|area)).)*(metro|area)", self.text)
if match:
return match.groups()[0]
tokens = [token for token in Numeric.tokens() if token != "un"]
sentence = " ".join([Parser.spell(word) for word in match.group().split(" ")])
pos = None
for token in tokens:
if token in sentence:
new_pos = sentence.index(token)
if not pos or pos > new_pos:
pos = new_pos
if pos is None:
return pos
pattern = re.compile("(" + "|".join(tokens) + ").*")
return pattern.match(Parser.sub(r" *(area|metro) *", "", sentence), pos=pos).group().strip() + " (" + match.groups()[1] + ")"
class OwnershipParser (Parser):
@staticmethod
def match (text):
match = Parser.search(r"TITULO *\: *((?!CARGAS).)+", text)
match = Parser.search(r"titulo *\: *((?!cargas).)+", text)
if match:
ownership_body = Parser.sub(r"^TITULO *\: *", "", Parser.sub(r"€+", " ", match.group()))
ownership_body = Parser.sub(r"^titulo *\: *", "", match.group())
return OwnershipParser(ownership_body)
@property
@ -172,51 +258,44 @@ class OwnershipParser (Parser):
return data
@property
# @clean_match
def owner (self):
match = Parser.search(r"Titular\/es\:? *(((?!,).)+)", self.text)
match = Parser.search(r"titular\/es\:? *(((?!,).)+)", self.text)
if match:
return match.groups()[0].strip()
@property
# @clean_match
def nie (self):
match = Parser.search(r"(DNI|CIF).* ([a-zA-Z]?[0-9]{8}[a-zA-Z]?) ", self.text)
match = Parser.search(r"(dni|cif).* (?:-)?([a-zA-Z]?[0-9]{8}[a-zA-Z]?) *", self.text)
if match:
return match.groups()[1].strip()
@property
# @clean_match
def participation (self):
match = Parser.search(r"Participación *\: *(((?!Título).)+)", self.text)
match = Parser.search(r"participacion *\: *(((?!Titulo).)+)", self.text)
if match:
return match.groups()[0].strip()
@property
# @clean_match
def adjudication (self):
match = Parser.search(r"Título *: *(((?!Notario).)+)", self.text)
match = Parser.search(r"titulo *: *(((?!notario).)+)", self.text)
if match:
return match.groups()[0].strip()
@property
# @clean_match
def notary (self):
match = Parser.search(r"Notario(?:\/Autoridad)? *: *(((?!Población).)+) ", self.text)
match = Parser.search(r"notario(?:\/autoridad)? *: *(((?!poblacion).)+) ", self.text)
if match:
return match.groups()[0].strip()
@property
# @clean_match
def town (self):
match = Parser.search(r"Población *: *(((?!Fecha).)+)", self.text)
match = Parser.search(r"poblacion *: *(((?!fecha).)+)", self.text)
if match:
return match.groups()[0].strip()
@property
# @clean_match
def date (self):
match = Parser.search(r"Fecha (?:documento|escritura) *: *((?!Protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})", self.text)
match = Parser.search(r"fecha (?:documento|escritura) *: *((?!protocolo)[0-9]{2}\/[0-9]{2}\/[0-9]{4})", self.text)
if match:
return match.groups()[0].strip()
@ -225,9 +304,9 @@ class ChargesParser (Parser):
@staticmethod
def match (text):
match = Parser.search(r"CARGAS *\: *((?!PRESENTACION).)+", text)
match = Parser.search(r"cargas *\: *((?!presentacion).)+", text)
if match:
charges_body = Parser.sub(r"^CARGAS\: *", "", Parser.sub("€+", " ", match.group()))
charges_body = Parser.sub(r"^cargas\: *", "", match.group())
return ChargesParser(charges_body)
# def get_presentation (text):
@ -243,13 +322,17 @@ class TextParser (object):
__ownership = OwnershipParser()
__charges = ChargesParser()
def __init__ (self, text):
def __init__ (self, text, spell=False):
if spell is True:
pass
# text = Parser.spell(text)
text = Parser.clean_text(text)
self.cover = text
text = secure_replace(self.cover.text, "", text)
text = Parser.sub(re.escape(self.cover.text), "", text)
self.description = text
text = secure_replace(self.cover.text, "", text)
text = Parser.sub(re.escape(self.cover.text), "", text)
self.ownership = text
text = secure_replace(self.ownership.text, "", text)
text = Parser.sub(re.escape(self.ownership.text), "", text)
self.charges = text
@property
@ -329,13 +412,6 @@ class TextParser (object):
return self.text
def secure_replace (pattern, replace, string):
try:
return re.sub(pattern, replace, string)
except Exception as e:
return string
if __name__ == "__main__":
import os

10
server/static/index.html

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Desregistradores</title>
</head>
<body>
<h1>Desregistradores</h1>
</body>
</html>

8
main.py → server/test.py

@ -2,15 +2,15 @@
import json
# SOURCE
from parsers import PdfParser
from parsers.geocoding import GeoLocation
from src.parsers.pdf import PdfParser
from src.parsers.geocoding import GeoLocation
if __name__ == "__main__":
import os
directory = os.path.realpath(os.path.join(os.path.abspath(os.path.dirname(__file__)), "pdfs/T1"))
with open("log.txt", "wt") as conn:
directory = os.path.realpath(os.path.join(os.path.dirname(__file__), *"pdfs/T1".split("/")))
with open(os.path.realpath(os.path.join(os.path.dirname(__file__), *"log/log.txt".split("/"))), "wt") as conn:
for file_name in os.listdir(directory):
file_path = os.path.join(directory, file_name)
parser = PdfParser(file_path)

20
server/wsgi.py

@ -0,0 +1,20 @@
import os
import asyncio
import aiohttp
import aiofiles
from aiohttp import web

# NOTE: `dir` shadows the builtin; the name is kept so existing references
# to this module attribute keep working.
dir = os.path.dirname(__file__)

routes = web.RouteTableDef()


@routes.get("/")
async def index(request):
    """Serve static/index.html as the landing page."""
    async with aiofiles.open(os.path.join(dir, "static", "index.html"), encoding="utf-8") as conn:
        body = await conn.read()
    return web.Response(text=body, content_type="text/html")


app = web.Application()
# Handlers declared on the route table only take effect once registered.
app.add_routes(routes)
app.router.add_static("/static/",
                      path=os.path.join(dir, "static"),
                      name="static")

if __name__ == "__main__":
    web.run_app(app)
Loading…
Cancel
Save