- Addition of general chunking parameters chunking_heading_level and chunking_patterns
- Addition of processor types DOCX and Markdown
This commit is contained in:
@@ -55,7 +55,6 @@ class Config(object):
|
||||
|
||||
# file upload settings
|
||||
MAX_CONTENT_LENGTH = 50 * 1024 * 1024
|
||||
UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']
|
||||
|
||||
# supported languages
|
||||
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
|
||||
@@ -143,10 +142,7 @@ class Config(object):
|
||||
LANGCHAIN_ENDPOINT = 'https://api.smith.langchain.com'
|
||||
LANGCHAIN_PROJECT = "eveai"
|
||||
|
||||
|
||||
SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']
|
||||
|
||||
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test', 'Wordpress Starter']
|
||||
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']
|
||||
|
||||
# The maximum number of seconds allowed for audio compression (to save resources)
|
||||
MAX_COMPRESSION_DURATION = 60*10 # 10 minutes
|
||||
|
||||
@@ -5,6 +5,19 @@ PROCESSOR_TYPES = {
|
||||
"file_types": "html",
|
||||
"Description": "A processor for HTML files",
|
||||
"configuration": {
|
||||
"chunking_patterns": {
|
||||
"name": "Chunking Patterns",
|
||||
"description": "A list of Patterns used to chunk files into logical pieces",
|
||||
"type": "chunking_patterns",
|
||||
"required": False
|
||||
},
|
||||
"chunking_heading_level": {
|
||||
"name": "Chunking Heading Level",
|
||||
"type": "integer",
|
||||
"description": "Maximum heading level to consider for chunking (1-6)",
|
||||
"required": False,
|
||||
"default": 2
|
||||
},
|
||||
"html_tags": {
|
||||
"name": "HTML Tags",
|
||||
"type": "string",
|
||||
@@ -45,7 +58,21 @@ PROCESSOR_TYPES = {
|
||||
"name": "PDF Processor",
|
||||
"file_types": "pdf",
|
||||
"Description": "A Processor for PDF files",
|
||||
"configuration": {}
|
||||
"configuration": {
|
||||
"chunking_patterns": {
|
||||
"name": "Chunking Patterns",
|
||||
"description": "A list of Patterns used to chunk files into logical pieces",
|
||||
"type": "chunking_patterns",
|
||||
"required": False
|
||||
},
|
||||
"chunking_heading_level": {
|
||||
"name": "Chunking Heading Level",
|
||||
"type": "integer",
|
||||
"description": "Maximum heading level to consider for chunking (1-6)",
|
||||
"required": False,
|
||||
"default": 2
|
||||
},
|
||||
},
|
||||
},
|
||||
"AUDIO_PROCESSOR": {
|
||||
"name": "AUDIO Processor",
|
||||
@@ -53,4 +80,89 @@ PROCESSOR_TYPES = {
|
||||
"Description": "A Processor for audio files",
|
||||
"configuration": {}
|
||||
},
|
||||
"MARKDOWN_PROCESSOR": {
|
||||
"name": "Markdown Processor",
|
||||
"file_types": "md",
|
||||
"Description": "A Processor for markdown files",
|
||||
"configuration": {
|
||||
"chunking_patterns": {
|
||||
"name": "Chunking Patterns",
|
||||
"description": "A list of Patterns used to chunk files into logical pieces",
|
||||
"type": "chunking_patterns",
|
||||
"required": False
|
||||
},
|
||||
"chunking_heading_level": {
|
||||
"name": "Chunking Heading Level",
|
||||
"type": "integer",
|
||||
"description": "Maximum heading level to consider for chunking (1-6)",
|
||||
"required": False,
|
||||
"default": 2
|
||||
},
|
||||
}
|
||||
},
|
||||
"DOCX_PROCESSOR": {
|
||||
"name": "DOCX Processor",
|
||||
"file_types": "docx",
|
||||
"Description": "A processor for DOCX files",
|
||||
"configuration": {
|
||||
"chunking_patterns": {
|
||||
"name": "Chunking Patterns",
|
||||
"description": "A list of Patterns used to chunk files into logical pieces",
|
||||
"type": "chunking_patterns",
|
||||
"required": False
|
||||
},
|
||||
"chunking_heading_level": {
|
||||
"name": "Chunking Heading Level",
|
||||
"type": "integer",
|
||||
"description": "Maximum heading level to consider for chunking (1-6)",
|
||||
"required": False,
|
||||
"default": 2
|
||||
},
|
||||
"extract_comments": {
|
||||
"name": "Extract Comments",
|
||||
"type": "boolean",
|
||||
"description": "Whether to include document comments in the markdown",
|
||||
"required": False,
|
||||
"default": False
|
||||
},
|
||||
"extract_headers_footers": {
|
||||
"name": "Extract Headers/Footers",
|
||||
"type": "boolean",
|
||||
"description": "Whether to include headers and footers in the markdown",
|
||||
"required": False,
|
||||
"default": False
|
||||
},
|
||||
"preserve_formatting": {
|
||||
"name": "Preserve Formatting",
|
||||
"type": "boolean",
|
||||
"description": "Whether to preserve bold, italic, and other text formatting",
|
||||
"required": False,
|
||||
"default": True
|
||||
},
|
||||
"list_style": {
|
||||
"name": "List Style",
|
||||
"type": "enum",
|
||||
"description": "How to format lists in markdown",
|
||||
"required": False,
|
||||
"default": "dash",
|
||||
"allowed_values": ["dash", "asterisk", "plus"]
|
||||
},
|
||||
"image_handling": {
|
||||
"name": "Image Handling",
|
||||
"type": "enum",
|
||||
"description": "How to handle embedded images",
|
||||
"required": False,
|
||||
"default": "skip",
|
||||
"allowed_values": ["skip", "extract", "placeholder"]
|
||||
},
|
||||
"table_alignment": {
|
||||
"name": "Table Alignment",
|
||||
"type": "enum",
|
||||
"description": "How to align table contents",
|
||||
"required": False,
|
||||
"default": "left",
|
||||
"allowed_values": ["left", "center", "preserve"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user