- Replace old implementation of PROCESSOR_TYPES and CATALOG_TYPES with the new cached approach

- Add an ordered_list dynamic field type (to be refined)
- Add the Tabulator JavaScript library to the project
This commit is contained in:
Josako
2025-05-29 16:00:25 +02:00
parent 8a29eb0d8f
commit 25e169dbea
23 changed files with 687 additions and 199 deletions

View File

@@ -0,0 +1,59 @@
---
# Processor definition for DOCX files: identity fields first, then the
# user-configurable options, then descriptive metadata.
version: "1.0.0"
name: "DOCX Processor"
file_types: "docx"
description: "A processor for DOCX files"

# One entry per option. Each entry carries a display name, a field type,
# a description, a required flag and (where given) a default value.
# Options of type "enum" also list their permitted values.
configuration:
  # No default is declared for this option.
  # NOTE(review): "chunking_patterns" is presumably a project-defined field
  # type rather than a primitive - confirm against the field-type registry.
  chunking_patterns:
    name: "Chunking Patterns"
    type: "chunking_patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    required: false

  # NOTE(review): the 1-6 range is stated only in the description text;
  # nothing in this definition constrains the integer - confirm that the
  # consumer validates it.
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2

  extract_comments:
    name: "Extract Comments"
    type: "boolean"
    description: "Whether to include document comments in the markdown"
    required: false
    default: false

  extract_headers_footers:
    name: "Extract Headers/Footers"
    type: "boolean"
    description: "Whether to include headers and footers in the markdown"
    required: false
    default: false

  # The only boolean option here that defaults to true.
  preserve_formatting:
    name: "Preserve Formatting"
    type: "boolean"
    description: "Whether to preserve bold, italic, and other text formatting"
    required: false
    default: true

  list_style:
    name: "List Style"
    type: "enum"
    description: "How to format lists in markdown"
    required: false
    default: "dash"
    allowed_values: ["dash", "asterisk", "plus"]

  image_handling:
    name: "Image Handling"
    type: "enum"
    description: "How to handle embedded images"
    required: false
    default: "skip"
    allowed_values: ["skip", "extract", "placeholder"]

  table_alignment:
    name: "Table Alignment"
    type: "enum"
    description: "How to align table contents"
    required: false
    default: "left"
    allowed_values: ["left", "center", "preserve"]

# Descriptive metadata. date_added stays quoted so it loads as a string
# rather than being converted to a date object by the parser.
# NOTE(review): date_added looks like a placeholder value - confirm the
# intended date.
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A processor for DOCX files"