You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
1.6 KiB
59 lines
1.6 KiB
# BUILT-INS |
|
import os |
|
|
|
# VENDOR |
|
from PyPDF2 import PdfFileReader |
|
|
|
# SOURCE |
|
from .text import TextParser |
|
from .image import ImageParser |
|
|
|
|
|
class PdfParser:
    """Open a PDF file and expose its content through a ``TextParser``.

    The file is opened in binary mode and handed to ``PdfFileReader``; every
    page object is loaded eagerly into ``self.pages``. The file handle stays
    open for the lifetime of the instance, so call :meth:`close` (or use the
    instance in a ``with`` statement) to release it deterministically.
    """

    def __init__(self, file_path):
        """Validate *file_path*, open the file and load its pages.

        Args:
            file_path: Non-empty ``str`` path to an existing regular file.

        Raises:
            ValueError: If *file_path* is empty or not a ``str``.
            FileExistsError: If *file_path* does not point to a regular file.
        """
        # Defined before any validation so close()/__del__ stay safe when the
        # constructor raises part-way through.
        self.conn = None

        if not file_path or not isinstance(file_path, str):
            raise ValueError("file_path arguments is not a valid type")
        if not os.path.isfile(file_path):
            # NOTE(review): FileNotFoundError would describe this situation
            # better; FileExistsError is kept because callers may catch it.
            raise FileExistsError("Can't find nothing at the end of the path")

        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.conn = open(file_path, "rb")
        try:
            self.parser = PdfFileReader(self.conn)
            self.pages = [
                self.parser.getPage(index)
                for index in range(self.parser.getNumPages())
            ]
        except Exception:
            # Do not leave the handle open when the reader rejects the file.
            self.conn.close()
            raise

    @property
    def format(self):
        """Return ``"str"`` if the first page yields text, otherwise ``"img"``.

        Only the first page is inspected. A document without any page has no
        extractable text and is therefore reported as ``"img"``.
        """
        if self.pages and self.pages[0].extractText():
            return "str"
        return "img"

    @property
    def data(self):
        """Return a ``TextParser`` built from the document content.

        For ``"str"`` documents the text of every page is concatenated, each
        page preceded by a newline. Otherwise the text comes from
        ``ImageParser(self.file_path).text`` (presumably OCR -- confirm
        against the ``image`` module).
        """
        if self.format == "str":
            text = "".join("\n" + page.extractText() for page in self.pages)
            return TextParser(text)
        return TextParser(ImageParser(self.file_path).text)

    def close(self):
        """Close the underlying file handle; safe to call repeatedly."""
        # getattr guards instances whose __init__ never ran to completion.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file handle on leaving the ``with`` block."""
        self.close()

    def __str__(self):
        """Return the ``text`` attribute of :attr:`data`."""
        return self.data.text

    def __del__(self):
        """Release the file handle as a last resort at garbage collection."""
        self.close()
|
|
|
|
|
if __name__ == "__main__":
    # Demo driver: parse every entry of ../pdfs/T1 (relative to this module)
    # and print the upper-cased file name followed by parser.data.ownership.data.
    module_dir = os.path.abspath(os.path.dirname(__file__))
    directory = os.path.relpath(os.path.join(module_dir, "../pdfs/T1"))

    for file_name in os.listdir(directory):
        # The parser is built before anything is printed, so a failing entry
        # aborts the run without emitting a header for it.
        parser = PdfParser(os.path.join(directory, file_name))
        print(file_name.upper())
        print(parser.data.ownership.data)
        print()
|
|
|