- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of processor types DOCX and Markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -55,7 +55,6 @@ class Config(object):
# file upload settings
MAX_CONTENT_LENGTH = 50 * 1024 * 1024
UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']
# supported languages
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
@@ -143,10 +142,7 @@ class Config(object):
LANGCHAIN_ENDPOINT = 'https://api.smith.langchain.com'
LANGCHAIN_PROJECT = "eveai"
SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test', 'Wordpress Starter']
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']
# The maximum number of seconds allowed for audio compression (to save resources)
MAX_COMPRESSION_DURATION = 60*10 # 10 minutes

View File

@@ -5,6 +5,19 @@ PROCESSOR_TYPES = {
"file_types": "html",
"Description": "A processor for HTML files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"html_tags": {
"name": "HTML Tags",
"type": "string",
@@ -45,7 +58,21 @@ PROCESSOR_TYPES = {
"name": "PDF Processor",
"file_types": "pdf",
"Description": "A Processor for PDF files",
"configuration": {}
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
},
},
"AUDIO_PROCESSOR": {
"name": "AUDIO Processor",
@@ -53,4 +80,89 @@ PROCESSOR_TYPES = {
"Description": "A Processor for audio files",
"configuration": {}
},
# Processor type entry for markdown files (file type "md").
# It declares only the two chunking fields; both are optional ("required": False).
"MARKDOWN_PROCESSOR": {
"name": "Markdown Processor",
"file_types": "md",
# NOTE(review): this entry-level key is capitalised ("Description") while the
# per-field key below is lowercase ("description") - confirm consumers expect both.
"Description": "A Processor for markdown files",
"configuration": {
# Optional field using the custom field type "chunking_patterns"; no default is declared.
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
# Optional integer field, default 2. The 1-6 range is stated only in the
# description text; no min/max bounds are declared in this entry.
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
}
},
# Processor type entry for DOCX files (file type "docx").
# It declares the two chunking fields followed by DOCX-specific options.
# Every field is optional ("required": False).
"DOCX_PROCESSOR": {
"name": "DOCX Processor",
"file_types": "docx",
# NOTE(review): this entry-level key is capitalised ("Description") while the
# per-field key below is lowercase ("description") - confirm consumers expect both.
"Description": "A processor for DOCX files",
"configuration": {
# --- Chunking fields ---
# Optional field using the custom field type "chunking_patterns"; no default is declared.
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
# Optional integer field, default 2. The 1-6 range is stated only in the
# description text; no min/max bounds are declared in this entry.
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
# --- Boolean options ---
# Comments are excluded by default.
"extract_comments": {
"name": "Extract Comments",
"type": "boolean",
"description": "Whether to include document comments in the markdown",
"required": False,
"default": False
},
# Headers and footers are excluded by default.
"extract_headers_footers": {
"name": "Extract Headers/Footers",
"type": "boolean",
"description": "Whether to include headers and footers in the markdown",
"required": False,
"default": False
},
# The only boolean option here that defaults to True.
"preserve_formatting": {
"name": "Preserve Formatting",
"type": "boolean",
"description": "Whether to preserve bold, italic, and other text formatting",
"required": False,
"default": True
},
# --- Enum options: each "default" is one of its "allowed_values" ---
"list_style": {
"name": "List Style",
"type": "enum",
"description": "How to format lists in markdown",
"required": False,
"default": "dash",
"allowed_values": ["dash", "asterisk", "plus"]
},
# Defaults to "skip"; the behaviour behind each value is implemented
# outside this entry.
"image_handling": {
"name": "Image Handling",
"type": "enum",
"description": "How to handle embedded images",
"required": False,
"default": "skip",
"allowed_values": ["skip", "extract", "placeholder"]
},
# NOTE(review): the allowed values are "left", "center" and "preserve";
# there is no "right" option - confirm this is intentional.
"table_alignment": {
"name": "Table Alignment",
"type": "enum",
"description": "How to align table contents",
"required": False,
"default": "left",
"allowed_values": ["left", "center", "preserve"]
}
}
}
}