- Replace old implementation of PROCESSOR_TYPES and CATALOG_TYPES with the new cached approach
- Add an ordered_list dynamic field type (to be refined)
- Add the Tabulator JavaScript library to the project
This commit is contained in:
9
config/processors/globals/AUDIO_PROCESSOR/1.0.0.yaml
Normal file
9
config/processors/globals/AUDIO_PROCESSOR/1.0.0.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Processor definition: "AUDIO Processor", version 1.0.0.
# NOTE(review): nesting was rebuilt from a flattened diff view; the second
# `description` is assumed to belong to `metadata` -- confirm against the
# repository file.
version: "1.0.0"
name: "AUDIO Processor"
# Presumably the file extensions this processor handles, comma-separated.
file_types: "mp3, mp4, ogg"
description: "A Processor for audio files"
# Empty mapping: this definition declares no configurable options.
configuration: {}
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Processor for audio files"
|
||||
59
config/processors/globals/DOCX_PROCESSOR/1.0.0.yaml
Normal file
59
config/processors/globals/DOCX_PROCESSOR/1.0.0.yaml
Normal file
@@ -0,0 +1,59 @@
|
||||
# Processor definition: "DOCX Processor", version 1.0.0.
# NOTE(review): nesting was rebuilt from a flattened diff view (option
# attributes under each option key, author/date/description under
# `metadata`) -- confirm against the repository file.
version: "1.0.0"
name: "DOCX Processor"
# Presumably the file extensions this processor handles, comma-separated.
file_types: "docx"
description: "A processor for DOCX files"
# One entry per configurable option. Each entry carries a display name, a
# type, a description and a `required` flag; most also carry a `default`,
# and enum options list their `allowed_values`.
configuration:
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2
  extract_comments:
    name: "Extract Comments"
    type: "boolean"
    description: "Whether to include document comments in the markdown"
    required: false
    default: false
  extract_headers_footers:
    name: "Extract Headers/Footers"
    type: "boolean"
    description: "Whether to include headers and footers in the markdown"
    required: false
    default: false
  preserve_formatting:
    name: "Preserve Formatting"
    type: "boolean"
    description: "Whether to preserve bold, italic, and other text formatting"
    required: false
    default: true
  list_style:
    name: "List Style"
    type: "enum"
    description: "How to format lists in markdown"
    required: false
    default: "dash"
    allowed_values: ["dash", "asterisk", "plus"]
  image_handling:
    name: "Image Handling"
    type: "enum"
    description: "How to handle embedded images"
    required: false
    default: "skip"
    allowed_values: ["skip", "extract", "placeholder"]
  table_alignment:
    name: "Table Alignment"
    type: "enum"
    description: "How to align table contents"
    required: false
    default: "left"
    allowed_values: ["left", "center", "preserve"]
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A processor for DOCX files"
|
||||
49
config/processors/globals/HTML_PROCESSOR/1.0.0.yaml
Normal file
49
config/processors/globals/HTML_PROCESSOR/1.0.0.yaml
Normal file
@@ -0,0 +1,49 @@
|
||||
# Processor definition: "HTML Processor", version 1.0.0.
# NOTE(review): nesting was rebuilt from a flattened diff view (option
# attributes under each option key, author/date/description under
# `metadata`) -- confirm against the repository file.
version: "1.0.0"
name: "HTML Processor"
# Presumably the file extensions this processor handles, comma-separated.
file_types: "html"
description: "A processor for HTML files"
# One entry per configurable option. Each entry carries a display name, a
# type, a description and a `required` flag; most also carry a `default`.
configuration:
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2
  html_tags:
    name: "HTML Tags"
    type: "string"
    description: "A comma-separated list of HTML tags"
    required: true
    default: "p, h1, h2, h3, h4, h5, h6, li, table, thead, tbody, tr, td"
  html_end_tags:
    name: "HTML End Tags"
    type: "string"
    description: "A comma-separated list of HTML end tags (where can the chunk end)"
    required: true
    default: "p, li, table"
  html_included_elements:
    name: "HTML Included Elements"
    type: "string"
    description: "A comma-separated list of elements to be included"
    required: true
    default: "article, main"
  html_excluded_elements:
    name: "HTML Excluded Elements"
    type: "string"
    description: "A comma-separated list of elements to be excluded"
    required: false
    default: "header, footer, nav, script"
  # No `default` is declared for this option.
  html_excluded_classes:
    name: "HTML Excluded Classes"
    type: "string"
    description: "A comma-separated list of classes to be excluded"
    required: false
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A processor for HTML files"
|
||||
20
config/processors/globals/MARKDOWN_PROCESSOR/1.0.0.yaml
Normal file
20
config/processors/globals/MARKDOWN_PROCESSOR/1.0.0.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Processor definition: "Markdown Processor", version 1.0.0.
# NOTE(review): nesting was rebuilt from a flattened diff view (option
# attributes under each option key, author/date/description under
# `metadata`) -- confirm against the repository file.
version: "1.0.0"
name: "Markdown Processor"
# Presumably the file extensions this processor handles, comma-separated.
file_types: "md"
description: "A Processor for markdown files"
# One entry per configurable option: display name, type, description,
# `required` flag and, where given, a `default`.
configuration:
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Processor for markdown files"
|
||||
20
config/processors/globals/PDF_PROCESSOR/1.0.0.yaml
Normal file
20
config/processors/globals/PDF_PROCESSOR/1.0.0.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Processor definition: "PDF Processor", version 1.0.0.
# NOTE(review): nesting was rebuilt from a flattened diff view (option
# attributes under each option key, author/date/description under
# `metadata`) -- confirm against the repository file.
version: "1.0.0"
name: "PDF Processor"
# Presumably the file extensions this processor handles, comma-separated.
file_types: "pdf"
description: "A Processor for PDF files"
# One entry per configurable option: display name, type, description,
# `required` flag and, where given, a `default`.
configuration:
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Processor for PDF files"
|
||||
Reference in New Issue
Block a user