- Replace old implementation of PROCESSOR_TYPES and CATALOG_TYPES with the new cached approach

- Add an ordered_list dynamic field type (to be refined)
- Add the Tabulator JavaScript library to the project
This commit is contained in:
Josako
2025-05-29 16:00:25 +02:00
parent 8a29eb0d8f
commit 25e169dbea
23 changed files with 687 additions and 199 deletions

View File

@@ -0,0 +1,21 @@
# Catalog type definition: Dossier Catalog.
# Each entry under `configuration` describes one setting: display name,
# value type, description, whether it is required, and its default.
version: "1.0.0"
name: "Dossier Catalog"
description: "A Catalog with information in Evie's Library in which several Dossiers can be stored"
configuration:
  tagging_fields:
    name: "Tagging Fields"
    type: "tagging_fields"
    # Literal block scalar (|-) keeps the line breaks of the bullet list;
    # a quoted multi-line scalar would fold them into single spaces.
    description: |-
      Define the metadata fields that will be used for tagging documents.
      Each field must have:
      - type: one of 'string', 'integer', 'float', 'date', 'enum'
      - required: boolean indicating if the field is mandatory
      - description: field description
      - allowed_values: list of values (for enum type only)
      - min_value/max_value: range limits (for numeric types only)
    required: true
    default: {}
# NOTE(review): presumably lists the `configuration` keys that also apply to
# individual document versions - confirm against the consuming loader.
document_version_configurations: ["tagging_fields"]
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Catalog with information in Evie's Library in which several Dossiers can be stored"

View File

@@ -0,0 +1,9 @@
# Catalog type definition: Standard Catalog.
# This type declares no settings, hence the explicit empty collections.
version: "1.0.0"
name: "Standard Catalog"
description: "A Catalog with information in Evie's Library, to be considered as a whole"
configuration: {}
document_version_configurations: []
# `metadata` is a nested mapping; its `description` must stay indented so it
# does not collide with the top-level `description` key.
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Catalog with information in Evie's Library, to be considered as a whole"

View File

@@ -0,0 +1,9 @@
# Processor type definition: audio files.
# `file_types` is a comma-separated string of the handled file extensions.
version: "1.0.0"
name: "AUDIO Processor"
file_types: "mp3, mp4, ogg"
description: "A Processor for audio files"
# No configurable settings for this processor.
configuration: {}
# `metadata` is a nested mapping; its `description` must stay indented so it
# does not collide with the top-level `description` key.
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Processor for audio files"

View File

@@ -0,0 +1,59 @@
# Processor type definition: DOCX files.
# Each entry under `configuration` describes one setting: display name,
# value type, description, whether it is required, and (optionally) a default
# and the allowed values for enum settings.
version: "1.0.0"
name: "DOCX Processor"
file_types: "docx"
description: "A processor for DOCX files"
configuration:
  # Chunking settings
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2

  # Content extraction switches
  extract_comments:
    name: "Extract Comments"
    type: "boolean"
    description: "Whether to include document comments in the markdown"
    required: false
    default: false
  extract_headers_footers:
    name: "Extract Headers/Footers"
    type: "boolean"
    description: "Whether to include headers and footers in the markdown"
    required: false
    default: false
  preserve_formatting:
    name: "Preserve Formatting"
    type: "boolean"
    description: "Whether to preserve bold, italic, and other text formatting"
    required: false
    default: true

  # Markdown rendering choices; each default is one of its allowed_values
  list_style:
    name: "List Style"
    type: "enum"
    description: "How to format lists in markdown"
    required: false
    default: "dash"
    allowed_values: ["dash", "asterisk", "plus"]
  image_handling:
    name: "Image Handling"
    type: "enum"
    description: "How to handle embedded images"
    required: false
    default: "skip"
    allowed_values: ["skip", "extract", "placeholder"]
  table_alignment:
    name: "Table Alignment"
    type: "enum"
    description: "How to align table contents"
    required: false
    default: "left"
    allowed_values: ["left", "center", "preserve"]
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A processor for DOCX files"

View File

@@ -0,0 +1,49 @@
# Processor type definition: HTML files.
# Each entry under `configuration` describes one setting: display name,
# value type, description, whether it is required, and (optionally) a default.
version: "1.0.0"
name: "HTML Processor"
file_types: "html"
description: "A processor for HTML files"
configuration:
  # Chunking settings
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2

  # Tag and element selection; all values are comma-separated strings
  html_tags:
    name: "HTML Tags"
    type: "string"
    description: "A comma-separated list of HTML tags"
    required: true
    default: "p, h1, h2, h3, h4, h5, h6, li, table, thead, tbody, tr, td"
  html_end_tags:
    name: "HTML End Tags"
    type: "string"
    description: "A comma-separated list of HTML end tags (where can the chunk end)"
    required: true
    default: "p, li, table"
  html_included_elements:
    name: "HTML Included Elements"
    type: "string"
    description: "A comma-separated list of elements to be included"
    required: true
    default: "article, main"
  html_excluded_elements:
    name: "HTML Excluded Elements"
    type: "string"
    description: "A comma-separated list of elements to be excluded"
    required: false
    default: "header, footer, nav, script"
  # No default is declared for this optional setting.
  html_excluded_classes:
    name: "HTML Excluded Classes"
    type: "string"
    description: "A comma-separated list of classes to be excluded"
    required: false
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A processor for HTML files"

View File

@@ -0,0 +1,20 @@
# Processor type definition: Markdown files.
# Each entry under `configuration` describes one setting: display name,
# value type, description, whether it is required, and (optionally) a default.
version: "1.0.0"
name: "Markdown Processor"
file_types: "md"
description: "A Processor for markdown files"
configuration:
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Processor for markdown files"

View File

@@ -0,0 +1,20 @@
# Processor type definition: PDF files.
# Each entry under `configuration` describes one setting: display name,
# value type, description, whether it is required, and (optionally) a default.
version: "1.0.0"
name: "PDF Processor"
file_types: "pdf"
description: "A Processor for PDF files"
configuration:
  chunking_patterns:
    name: "Chunking Patterns"
    description: "A list of Patterns used to chunk files into logical pieces"
    type: "chunking_patterns"
    required: false
  chunking_heading_level:
    name: "Chunking Heading Level"
    type: "integer"
    description: "Maximum heading level to consider for chunking (1-6)"
    required: false
    default: 2
metadata:
  author: "System"
  date_added: "2023-01-01"
  description: "A Processor for PDF files"

View File

@@ -2,28 +2,10 @@
# Registry of the available catalog types, keyed by type identifier.
# Only the display name and description are kept here; the per-type
# `configuration` and `document_version_configurations` entries that used to
# live in this dict are defined in the per-type YAML definitions
# ("Standard Catalog", "Dossier Catalog").
CATALOG_TYPES = {
    "STANDARD_CATALOG": {
        "name": "Standard Catalog",
        "description": "A Catalog with information in Evie's Library, to be considered as a whole",
    },
    "DOSSIER_CATALOG": {
        "name": "Dossier Catalog",
        "description": "A Catalog with information in Evie's Library in which several Dossiers can be stored",
    },
}

View File

@@ -1,168 +1,28 @@
# Processor Types
# Registry of the available processor types, keyed by type identifier.
# Each entry keeps only the display name, description and the comma-separated
# file extensions it handles; the per-processor `configuration` that used to
# live in this dict is defined in the per-type YAML definitions.
PROCESSOR_TYPES = {
    "HTML_PROCESSOR": {
        "name": "HTML Processor",
        "description": "A processor for HTML files",
        "file_types": "html",
    },
    "PDF_PROCESSOR": {
        "name": "PDF Processor",
        "description": "A Processor for PDF files",
        "file_types": "pdf",
    },
    "AUDIO_PROCESSOR": {
        "name": "AUDIO Processor",
        "description": "A Processor for audio files",
        "file_types": "mp3, mp4, ogg",
    },
    "MARKDOWN_PROCESSOR": {
        "name": "Markdown Processor",
        "description": "A Processor for markdown files",
        "file_types": "md",
    },
    "DOCX_PROCESSOR": {
        "name": "DOCX Processor",
        "description": "A processor for DOCX files",
        "file_types": "docx",
    },
}

View File

@@ -16,5 +16,9 @@ SPECIALIST_TYPES = {
"name": "Traicie Role Definition Specialist",
"description": "Assistant Defining Competencies and KO Criteria",
"partner": "traicie"
},
"TRAICIE_SELECTION_SPECIALIST": {
"name": "Traicie Selection Specialist",
"description": "Recruitment Selection Assistant",
}
}