159 lines
4.2 KiB
Python
159 lines
4.2 KiB
Python
|
from glob import glob
|
||
|
import os
|
||
|
import logging
|
||
|
import shutil
|
||
|
import time
|
||
|
import sys
|
||
|
|
||
|
from PyPDF2 import PdfWriter, PdfReader
|
||
|
|
||
|
from urllib.parse import urljoin
|
||
|
|
||
|
from typing import List
|
||
|
|
||
|
from watchdog.observers import Observer
|
||
|
from watchdog.events import PatternMatchingEventHandler
|
||
|
import tempfile
|
||
|
import requests
|
||
|
from requests.auth import HTTPBasicAuth
|
||
|
|
||
|
# Unit conversion from PDF points to centimetres:
# 1 pt == 1/72 inch and 1 inch == 2.54 cm, so PTCM is "centimetres per point".
# Dividing a length in cm by PTCM therefore yields points.
PTCM = 1 / 72 * 2.54

# Paperless accepts BasicAuth or a session id; this script uses BasicAuth
# with credentials read from the environment.
username = os.environ.get("PAPERLESS_USERNAME")
password = os.environ.get("PAPERLESS_PASSWORD")

# Base URL where Paperless is installed and listening.
url = os.environ.get("PAPERLESS_URL")

# Tags added to every upload, given as a comma-separated list.
# Entries are stripped so "Scan, Bus" works, and empty entries are dropped.
default_tags = {
    name.strip()
    for name in os.environ.get("PAPERLESS_DEFAULT_TAGS", "Scan").split(",")
    if name.strip()
}

# Tag names that mark a document as a receipt that must be cropped.
# They are spelled the way ``str.title()`` renders the folder names.
receipe_tags = {"Receipes", "Receipes-Small"}

# Margin (in points) removed from the top and bottom of a cropped receipt.
receipe_trim_y = 0  # 0.5 / PTCM

# Target page width in points, keyed by lower-case folder name
# (8.5 cm for normal receipts, 6 cm for small ones).
receipe_width_map = {"receipes": 8.5 / PTCM, "receipes-small": 6 / PTCM}

# Tag name -> numeric tag id sent to the API.
# NOTE(review): presumably the ids of one specific Paperless instance - confirm.
tag_id_map = {"Scan": 38, "Steffen": 89, "Britta": 88, "Bus": 52, "Wohnen": 40}

# Tag name -> document type name. Tags are derived with ``str.title()``,
# which turns "receipes-small" into "Receipes-Small", so that spelling must be
# present; the old "Receipes-small" key is kept for backward compatibility.
tag_type_map = {
    "Receipes": "Quittung",
    "Receipes-Small": "Quittung",
    "Receipes-small": "Quittung",
}

# Document type name -> numeric document type id sent to the API.
document_type_id_map = {"Quittung": 9}
|
||
|
|
||
|
def append_suffix(filename, suffix):
    """Return *filename* with ``_<suffix>`` inserted before its extension.

    ``append_suffix("scan.pdf", "cropped")`` gives ``"scan_cropped.pdf"``.
    A name without an extension simply gets ``_<suffix>`` appended, and dots
    in directory components are never mistaken for the extension separator.
    """
    root, ext = os.path.splitext(filename)
    return f"{root}_{suffix}{ext}"
|
||
|
|
||
|
def crop_width(in_path, new_width):
    """Write a copy of the PDF at *in_path* cropped to *new_width* points.

    On every page a strip of width *new_width*, centred horizontally on the
    page's media box, is kept; ``receipe_trim_y`` points are removed from the
    top and the bottom. The trim, crop and media boxes are all set to that
    rectangle. The input file is left untouched.

    Args:
        in_path: Path of the PDF to read.
        new_width: Width of the kept strip, in PDF points.

    Returns:
        Path of the written file (*in_path* with a ``_cropped`` suffix).
    """
    out_path = append_suffix(in_path, 'cropped')
    half_width = new_width / 2

    with open(in_path, "rb") as in_f, open(out_path, 'wb+') as out_f:
        reader = PdfReader(in_f)
        writer = PdfWriter()

        # Uses the snake_case PyPDF2 API that matches the PdfReader/PdfWriter
        # names; the camelCase methods were removed in PyPDF2 3.x.
        for page in reader.pages:
            # Read the extents before any box is modified. The media box is
            # not assumed to start at (0, 0).
            media = page.mediabox
            left, right = float(media.left), float(media.right)
            bottom, top = float(media.bottom), float(media.top)
            center = (left + right) / 2

            trim = page.trimbox
            trim.lower_left = (center - half_width, bottom + receipe_trim_y)
            trim.upper_right = (center + half_width, top - receipe_trim_y)

            # Viewers honour different boxes, so set them all to the same area.
            page.cropbox = trim
            page.mediabox = trim

            writer.add_page(page)

        writer.write(out_f)

    return out_path
|
||
|
|
||
|
|
||
|
def upload_file(path):
    """Upload the PDF at *path* to Paperless and delete it on success.

    The name of the file's parent folder, title-cased, becomes a tag and is
    combined with ``default_tags``. Tags and the document types derived from
    them are translated to numeric ids through the module-level maps; names
    without an id are silently left out of the request.

    If one of the tags is a receipt tag and the folder has an entry in
    ``receipe_width_map``, the file is first replaced by a cropped copy.
    Files that already carry the ``_cropped.pdf`` suffix (left over from an
    earlier failed upload) are not cropped a second time.

    Failures are logged rather than raised, and the file is kept in that case.

    Args:
        path: Path of the PDF file to upload.
    """
    logging.info("Uploading: %s", path)

    folder = os.path.basename(os.path.dirname(path))

    tags = {folder.title()} | default_tags
    types = {tag_type_map[name] for name in tags if name in tag_type_map}

    logging.info("Tags: %s", ", ".join(tags))
    logging.info("Document types: %s", ", ".join(types))

    tag_ids = {tag_id_map[name] for name in tags if name in tag_id_map}
    type_ids = {
        document_type_id_map[name]
        for name in types
        if name in document_type_id_map
    }

    if tags & receipe_tags and not path.endswith("_cropped.pdf"):
        # The width map is keyed by lower-case folder name, so normalise the
        # case; a receipt tag that only comes from default_tags has no width.
        target_width = receipe_width_map.get(folder.lower())
        if target_width is None:
            logging.warning("No crop width known for folder %r, not cropping", folder)
        else:
            old_path = path
            logging.info("Cropping receipe...")
            path = crop_width(path, target_width)
            os.remove(old_path)

    title = os.path.splitext(os.path.basename(path))[0]
    form_fields = (
        [("tags", tag_id) for tag_id in tag_ids]
        + [("document_type", type_id) for type_id in type_ids]
        + [("title", title)]
    )

    try:
        with open(path, "rb") as f:
            response = requests.post(
                url=urljoin(url, "api/documents/post_document/"),
                data=form_fields,
                files={"document": (title, f, "application/pdf")},
                auth=HTTPBasicAuth(username, password),
                allow_redirects=False,
                # Without a timeout a stalled server would block the caller
                # forever.
                timeout=120,
            )
    except (OSError, requests.RequestException) as exc:
        # Keep the file so that a later run can retry the upload.
        logging.error("Failed: %s", exc)
        return

    # The file handle is closed here, so removal also works on platforms that
    # refuse to delete open files.
    if response.status_code in (200, 202):
        logging.info("Successful")
        os.remove(path)
    else:
        logging.error("Failed: %d (%s)", response.status_code, response.text)
|
||
|
|
||
|
|
||
|
class Handler(PatternMatchingEventHandler):
    """Event handler that uploads a matching file once it has been closed."""

    def on_closed(self, event):
        """Pass the closed file to ``upload_file``.

        Directory events are ignored, as are files ending in
        ``_cropped.pdf`` (the suffix ``crop_width`` gives its output).
        """
        if event.is_directory:
            return
        if event.src_path.endswith("_cropped.pdf"):
            return
        upload_file(event.src_path)
|
||
|
|
||
|
|
||
|
def main():
    """Upload all existing PDFs below a directory, then watch it for new ones.

    The directory is taken from the first command-line argument and defaults
    to the current directory. After the initial recursive upload, an observer
    is started that hands every matching ``*.pdf`` event to ``Handler`` until
    the process is interrupted with Ctrl-C.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    path = sys.argv[1] if len(sys.argv) > 1 else "."

    # Passed by keyword: the handler's arguments are keyword-only in newer
    # watchdog releases, and the keyword form works in older ones as well.
    event_handler = Handler(patterns=["*.pdf"])

    files = glob(os.path.join(path, "**", "*.pdf"), recursive=True)

    logging.info("Initial upload of: %s", files)

    for file in files:
        try:
            upload_file(file)
        except Exception:
            # One unreadable or broken file must not keep the watcher from
            # starting; log it with its traceback and carry on.
            logging.exception("Initial upload failed for %s", file)

    observer = Observer()
    observer.schedule(event_handler, path, recursive=True)
    observer.start()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        # Shut the observer thread down however the loop ended.
        observer.stop()
        observer.join()
|
||
|
|
||
|
|
||
|
# Start the uploader only when this file is executed as a script, so that
# importing the module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()
|