add script to upload scans to Paperless

2022-08-20 00:04:14 +02:00 · 2022-08-20 00:04:14 +02:00 · 7f231c2e5b
commit 7f231c2e5b
parent 309159fc91
1 changed files with 158 additions and 0 deletions
--- a/python/paperless-uploader.py
+++ b/python/paperless-uploader.py
@ -0,0 +1,158 @@
+from glob import glob
+import os
+import logging
+import shutil
+import time
+import sys
+
+from PyPDF2 import PdfWriter, PdfReader
+
+from urllib.parse import urljoin
+
+from typing import List
+
+from watchdog.observers import Observer
+from watchdog.events import PatternMatchingEventHandler
+import tempfile
+import requests
+from requests.auth import HTTPBasicAuth
+
+# PDF page coordinates are measured in points: 1 pt == 1/72 inch and
+# 1 inch == 2.54 cm. PTCM is therefore the size of one point in centimetres;
+# dividing a length in cm by PTCM converts it to points.
+PTCM = 1 / 72 * 2.54
+
+# Paperless accepts BasicAuth or a session id; this script uses BasicAuth.
+username = os.environ.get("PAPERLESS_USERNAME")
+password = os.environ.get("PAPERLESS_PASSWORD")
+
+# Base URL of the Paperless instance. It is combined with the API path via
+# urljoin(), so include a trailing slash if the URL contains a path component.
+url = os.environ.get("PAPERLESS_URL")
+
+# Tags added to every upload (comma separated in the environment). Entries are
+# stripped and empty ones dropped so "Scan, Bus" or a trailing comma do not
+# produce tag names like " Bus" or "".
+default_tags = {
+    name.strip()
+    for name in os.environ.get("PAPERLESS_DEFAULT_TAGS", "Scan").split(",")
+    if name.strip()
+}
+
+# Tags (title-cased directory names) whose documents get cropped before upload.
+receipe_tags = {"Receipes", "Receipes-Small"}
+
+# Vertical trim in points applied at the top and bottom when cropping.
+receipe_trim_y = 0 # 0.5 / PTCM
+# Target page width in points, keyed by the lower-case directory name.
+receipe_width_map = {"receipes": 8.5 / PTCM, "receipes-small": 6 / PTCM}
+
+# Paperless tag ids for the tag names this script knows about.
+tag_id_map = {"Scan": 38, "Steffen": 89, "Britta": 88, "Bus": 52, "Wohnen": 40}
+
+# Tag name -> document type name. Keys must use the same title-casing as
+# receipe_tags ("Receipes-Small", not "Receipes-small"), because tags are
+# derived from the directory name via str.title().
+tag_type_map = {"Receipes": "Quittung", "Receipes-Small": "Quittung"}
+# Paperless document type ids by document type name.
+document_type_id_map = {"Quittung": 9}
+
+def append_suffix(filename, suffix):
+    """Return *filename* with ``_<suffix>`` inserted before its extension.
+
+    ``scan.pdf`` with suffix ``cropped`` becomes ``scan_cropped.pdf``. A name
+    without an extension simply gets ``_<suffix>`` appended; dots in parent
+    directory names are never treated as an extension separator.
+    """
+    # os.path.splitext only looks at the last path component, so it neither
+    # fails on extension-less names nor splits inside a directory name.
+    root, ext = os.path.splitext(filename)
+    return f"{root}_{suffix}{ext}"
+
+def crop_width(in_path, new_width):
+    """Write a copy of the PDF at *in_path* cropped to *new_width* points.
+
+    Every page is cropped to a horizontally centred strip of width
+    *new_width*; ``receipe_trim_y`` points are removed from the top and the
+    bottom. The trim, crop and media boxes of each page are all set to the
+    same rectangle.
+
+    The result is written next to the input with a ``_cropped`` suffix; the
+    input file itself is left untouched.
+
+    Returns the path of the cropped file.
+    """
+    out_path = append_suffix(in_path, 'cropped')
+
+    with open(in_path, "rb") as in_f, open(out_path, 'wb+') as out_f:
+        reader = PdfReader(in_f)
+        writer = PdfWriter()
+
+        # Uses the snake_case PyPDF2 API that goes with PdfReader/PdfWriter;
+        # the camelCase equivalents are deprecated and removed in PyPDF2 3.x.
+        for page in reader.pages:
+            # The upper-right corner is taken as the page size, i.e. the
+            # media box is assumed to start at the origin.
+            width = float(page.mediabox.right)
+            height = float(page.mediabox.top)
+
+            center = width / 2
+
+            page.trimbox.lower_left = (center - new_width / 2, 0 + receipe_trim_y)
+            page.trimbox.upper_right = (center + new_width / 2, height - receipe_trim_y)
+
+            page.cropbox = page.trimbox
+            page.mediabox = page.trimbox
+
+            writer.add_page(page)
+
+        writer.write(out_f)
+
+    return out_path
+
+
+def upload_file(path):
+    """Upload the PDF at *path* to Paperless and delete it on success.
+
+    The name of the directory containing the file is used as a tag (title
+    cased) in addition to ``default_tags``. Tags and document types are
+    translated to Paperless ids via the module-level maps; names without a
+    known id are silently left out of the request.
+
+    Files carrying one of ``receipe_tags`` are cropped first. The uncropped
+    original is deleted and the ``*_cropped.pdf`` file is uploaded instead.
+    A file that already ends in ``_cropped.pdf`` (left over from an earlier
+    failed upload) is not cropped a second time.
+
+    On HTTP 200/202 the uploaded file is removed. On any other status, or if
+    the request or reading the file fails, the error is logged and the file
+    is kept so a later run can retry. Returns ``None``.
+    """
+    logging.info("Uploading: %s", path)
+
+    # The containing directory's name doubles as a tag.
+    folder = os.path.basename(os.path.dirname(path))
+
+    tags = {folder.title()} | default_tags
+    types = {tag_type_map[tag] for tag in tags if tag in tag_type_map}
+
+    logging.info("Tags: %s", ", ".join(tags))
+    logging.info("Document types: %s", ", ".join(types))
+
+    tag_ids = {tag_id_map[tag] for tag in tags if tag in tag_id_map}
+    type_ids = {document_type_id_map[typ] for typ in types if typ in document_type_id_map}
+
+    # Cropping an already cropped page again would compute a wrong centre,
+    # so leftovers from a previous failed upload are sent as they are.
+    if tags & receipe_tags and not path.endswith("_cropped.pdf"):
+        # The width map is keyed by lower-case directory names, while the tag
+        # check above is case-insensitive thanks to title(); normalise here.
+        new_width = receipe_width_map.get(folder.lower())
+
+        if new_width is None:
+            logging.warning("No crop width configured for %s, uploading uncropped", folder)
+        else:
+            logging.info("Cropping receipe...")
+            cropped_path = crop_width(path, new_width)
+
+            os.remove(path)
+            path = cropped_path
+
+    title = os.path.splitext(os.path.basename(path))[0]
+
+    try:
+        with open(path, "rb") as f:
+            response = requests.post(
+                url=urljoin(url, "api/documents/post_document/"),
+                data=[("tags", tag_id) for tag_id in tag_ids] +
+                     [("document_type", type_id) for type_id in type_ids] +
+                     [("title", title)],
+                files={"document": (title, f, "application/pdf")},
+                auth=HTTPBasicAuth(username, password),
+                allow_redirects=False,
+            )
+    except (OSError, requests.RequestException) as err:
+        # Keep the file and keep the caller (startup loop or watcher thread)
+        # alive; the upload can be retried on the next run.
+        logging.error("Failed: %s", err)
+        return
+
+    # The file handle is closed at this point, so removing it is safe on
+    # every platform.
+    if response.status_code in (200, 202):
+        logging.info("Successful")
+
+        os.remove(path)
+    else:
+        logging.error("Failed: %d (%s)", response.status_code, response.text)
+
+
+class Handler(PatternMatchingEventHandler):
+    """Watchdog handler that uploads files once they have been closed."""
+
+    def on_closed(self, event):
+        """Upload the closed file unless it is a directory or a cropped copy."""
+        if event.is_directory:
+            return
+
+        closed_path = event.src_path
+
+        # Files named *_cropped.pdf are skipped here; crop_width() produces
+        # such files and upload_file() sends them itself.
+        if closed_path.endswith("_cropped.pdf"):
+            return
+
+        upload_file(closed_path)
+
+
+def main():
+    """Upload all existing PDFs below a directory, then watch it for new ones.
+
+    The directory is taken from the first command line argument and defaults
+    to the current directory. Runs until interrupted with Ctrl-C.
+    """
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    path = sys.argv[1] if len(sys.argv) > 1 else "."
+
+    # Pass patterns by keyword: newer watchdog releases no longer accept it
+    # positionally, and the keyword form works on older releases as well.
+    event_handler = Handler(patterns=["*.pdf"])
+
+    files = glob(f"{path}/**/*.pdf", recursive=True)
+
+    logging.info("Initial upload of: %s", files)
+
+    for file in files:
+        try:
+            upload_file(file)
+        except Exception:
+            # One unreadable or broken PDF must not prevent the remaining
+            # files from being uploaded or the watcher from starting.
+            logging.exception("Initial upload failed for %s", file)
+
+    observer = Observer()
+    observer.schedule(event_handler, path, recursive=True)
+    observer.start()
+
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # Always shut the watcher thread down, whatever ended the loop.
+        observer.stop()
+        observer.join()
+
+
+if __name__ == "__main__":
+    main()