diff --git a/.gitignore b/.gitignore index 60171de..25b07c9 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,5 @@ dmypy.json # Pyre type checker .pyre/ + +.idea diff --git a/paperlessngx_postprocessor/paperless_api.py b/paperlessngx_postprocessor/paperless_api.py index 266fc0b..184e280 100644 --- a/paperlessngx_postprocessor/paperless_api.py +++ b/paperlessngx_postprocessor/paperless_api.py @@ -24,12 +24,28 @@ def __init__(self, api_url, auth_token, paperless_src_dir, logger=None): self._auth_token = auth_token self._cache = {} - self._cachable_types = ["correspondents", "document_types", "storage_paths", "tags"] + self._cachable_types = ["correspondents", "document_types", "storage_paths", "tags", "custom_fields"] self._paperless_api_version = 3 self._common_headers = {"Authorization": f"Token {self._auth_token}", + "Content-Type": "application/json", "Accept": f"application/json; version={self._paperless_api_version}"} + def _get_custom_fields(self): + return self._get_list("custom_fields") + + def get_custom_field_by_name(self, search_name): + self._get_custom_fields() + + search_result = [custom_field for custom_field in self._cache["custom_fields"] if custom_field["name"].lower() == search_name.lower().replace("_", " ")] + search_result = search_result[0] if search_result else None + + if search_result: + return search_result + else: + self._logger.debug(f"Custom field with name {search_name} cannot be found.") + return {} + def delete_document_by_id(self, document_id): item_type = "documents" item_id = document_id @@ -94,7 +110,7 @@ def get_item_id_by_name(self, item_type, item_name): def patch_document(self, document_id, data): response = requests.patch(f"{self._api_url}/documents/{document_id}/", headers = self._common_headers, - data = data) + json = data) if not response.ok: self._log_request_error(response) return response @@ -179,6 +195,9 @@ def get_storage_path_by_id(self, storage_path_id): def get_tag_by_id(self, tag_id): return 
self._get_item_by_id("tags", tag_id) + def get_custom_field_by_id(self, custom_field_id): + return self._get_item_by_id("custom_fields", custom_field_id) + def get_metadata_in_filename_format(self, metadata): new_metadata = {} new_metadata["document_id"] = metadata["id"] @@ -202,7 +221,8 @@ def get_metadata_in_filename_format(self, metadata): new_metadata["added_day"] = f"{added_date.day:02d}" new_metadata["added_date"] = added_date.strftime("%F") new_metadata["added_date_object"] = added_date - + new_metadata["custom_fields"] = metadata["custom_fields"] + return new_metadata def get_metadata_from_filename_format(self, metadata_in_filename_format): @@ -217,7 +237,8 @@ def get_metadata_from_filename_format(self, metadata_in_filename_format): #result["created"] = metadata_in_filename_format["created"] result["created_date"] = dateutil.parser.isoparse(metadata_in_filename_format["created"]).strftime("%F") result["added"] = metadata_in_filename_format["added"] - + result["custom_fields"] = metadata_in_filename_format["custom_fields"] + return result def get_metadata_for_post_consume_script(self, document_id): diff --git a/paperlessngx_postprocessor/postprocessor.py b/paperlessngx_postprocessor/postprocessor.py index 735e5d9..bf752cc 100644 --- a/paperlessngx_postprocessor/postprocessor.py +++ b/paperlessngx_postprocessor/postprocessor.py @@ -1,4 +1,5 @@ import calendar +import copy import dateutil.parser import jinja2 import logging @@ -22,6 +23,8 @@ def __init__(self, api, spec, logger = None): self._match = spec[self.name].get("match") self._metadata_regex = spec[self.name].get("metadata_regex") self._metadata_postprocessing = spec[self.name].get("metadata_postprocessing") + #self._custom_fields_regex = spec[self.name].get("custom_fields_regex") + #self._custom_fields_postprocessing = spec[self.name].get("custom_fields_postprocessing") self._validation_rule = spec[self.name].get("validation_rule") #self._title_format = spec[self.name].get("title_format") @@ -138,7 
+141,7 @@ def _jinja_filter_regex_sub(self, string, pattern, repl): return regex.sub(pattern, repl, string) def _normalize_created_dates(self, new_metadata, old_metadata): - result = new_metadata.copy() + result = copy.deepcopy(new_metadata) try: result["created_year"] = str(int(new_metadata["created_year"])) except: @@ -186,17 +189,27 @@ def get_new_metadata(self, metadata, content): read_only_metadata = {key: metadata[key] for key in read_only_metadata_keys if key in metadata} writable_metadata_keys = list(set(metadata.keys()) - set(read_only_metadata_keys)) writable_metadata = {key: metadata[key] for key in writable_metadata_keys if key in metadata} - + # Extract the regex_data if self._metadata_regex is not None: - match_object = regex.search(self._metadata_regex, content) - if match_object is not None: - regex_data = match_object.groupdict() - #writable_metadata.update(match_object.groupdict()) + matches = regex.finditer(self._metadata_regex, content) + # Iterate over all matches and merge it + regex_data = {} + for match_object in matches: + if match_object is not None: + current_groups = match_object.groupdict() + for group_name, value in current_groups.items(): + if group_name in regex_data and value not in regex_data[group_name]: + regex_data[group_name] = f"{regex_data[group_name]},{value}" + else: + regex_data[group_name] = value + + # Process all merged matches at once + if regex_data: # Only process if we found any matches writable_metadata.update([(k, regex_data[k]) for k in regex_data if regex_data[k] is not None]) writable_metadata = self._normalize_created_dates(writable_metadata, metadata) self._logger.debug(f"Regex results are {writable_metadata}") - else: + else: # No matches found self._logger.warning(f"Regex '{self._metadata_regex}' for '{self.name}' didn't match for document_id={metadata['document_id']}") # Cycle throguh the postprocessing rules @@ -205,8 +218,20 @@ def get_new_metadata(self, metadata, content): try: old_value = 
writable_metadata.get(variable_name) merged_metadata = {**writable_metadata, **read_only_metadata} - template = self._env.from_string(self._metadata_postprocessing[variable_name]) - writable_metadata[variable_name] = template.render(**merged_metadata) + + if variable_name != "custom_fields": + template = self._env.from_string(self._metadata_postprocessing[variable_name]) + writable_metadata[variable_name] = template.render(**merged_metadata) + elif variable_name == "custom_fields": + for custom_field_name in ( + self._metadata_postprocessing["custom_fields"] + ).keys(): + custom_field_definition = self._api.get_custom_field_by_name(custom_field_name) + for index, custom_field_metadata_iterate in enumerate(writable_metadata["custom_fields"]): + if custom_field_definition and custom_field_metadata_iterate["field"] == custom_field_definition["id"]: + template = self._env.from_string(self._metadata_postprocessing[variable_name][custom_field_name]) + writable_metadata[variable_name][index]["value"] = template.render(**merged_metadata) + writable_metadata = self._normalize_created_dates(writable_metadata, metadata) self._logger.debug(f"Updating '{variable_name}' using template {self._metadata_postprocessing[variable_name]} and metadata {merged_metadata}\n: '{old_value}'->'{writable_metadata[variable_name]}'") except Exception as e: @@ -257,7 +282,7 @@ def __init__(self, api, rules_dir, postprocessing_tag = None, invalid_tag = None def _get_new_metadata_in_filename_format(self, metadata_in_filename_format, content): - new_metadata = metadata_in_filename_format.copy() + new_metadata = copy.deepcopy(metadata_in_filename_format) for processor in self._processors: if processor.matches(metadata_in_filename_format): diff --git a/rulesets.d/example.yml b/rulesets.d/example.yml index 22a251a..520665a 100644 --- a/rulesets.d/example.yml +++ b/rulesets.d/example.yml @@ -25,3 +25,11 @@ Parse creation date from filename: created_month: '{{ title_old | 
regex_sub("^(?P<created_year>\d{4})-(?P<created_month>\d{2})-(?P<created_day>\d{2}) (?P<title>.*)$", "\g<created_month>") }}' created_day: '{{ title_old | regex_sub("^(?P<created_year>\d{4})-(?P<created_month>\d{2})-(?P<created_day>\d{2}) (?P<title>.*)$", "\g<created_day>") }}' validation_rule: '{{ num_documents(correspondent=correspondent, document_type=document_type, created_date_object=created_date_object) == 1 }}' +--- +Ruleset for Custom Field: + match: True + metadata_regex: 'Eingegangen (?P<entry_day>\d{1,2})\.(?P<entry_month>\d{1,2})\.(?P<entry_year>\d{4})' + metadata_postprocessing: + title: "Test Custom Fields Functionality" + custom_fields: + Eingegangen: '{{entry_year}}-{{entry_month}}-{{entry_day}}'