- Addition of the general chunking parameters chunking_heading_level and chunking_patterns

- Addition of the Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -64,6 +64,20 @@ class TaggingFields(BaseModel):
}
class ChunkingPatternsField(BaseModel):
"""Represents a set of chunking patterns"""
patterns: List[str]
@field_validator('patterns')
def validate_patterns(cls, patterns):
for pattern in patterns:
try:
re.compile(pattern)
except re.error as e:
raise ValueError(f"Invalid regex pattern '{pattern}': {str(e)}")
return patterns
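
A quick usage sketch of the new model (assuming it lives in common/utils/config_field_types.py and pydantic v2 validator semantics, as above):

from common.utils.config_field_types import ChunkingPatternsField

ChunkingPatternsField(patterns=[r"^Chapter \d+", r"^Appendix [A-Z]$"])  # all patterns compile, accepted
ChunkingPatternsField(patterns=["(["])  # fails validation: "([" is not a valid regex
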
class ArgumentConstraint(BaseModel):
"""Base class for all argument constraints"""
description: Optional[str] = None
@@ -611,3 +625,38 @@ def _generate_yaml_docs(fields: Dict[str, Any], version: str) -> str:
}
return yaml.dump(doc, sort_keys=False, default_flow_style=False)
def patterns_to_json(text_area_content: str) -> str:
"""Convert line-based patterns to JSON"""
text_area_content = text_area_content.strip()
if len(text_area_content) == 0:
return json.dumps([])
# Split on newlines and remove empty lines
patterns = [line.strip() for line in text_area_content.split('\n') if line.strip()]
return json.dumps(patterns)
def json_to_patterns(json_content: str) -> str:
"""Convert JSON patterns list to text area content"""
try:
patterns = json.loads(json_content)
if not isinstance(patterns, list):
raise ValueError("JSON must contain a list of patterns")
# Join with newlines
return '\n'.join(patterns)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format: {e}")
def json_to_pattern_list(json_content: str) -> list:
"""Convert JSON patterns list to text area content"""
try:
patterns = json.loads(json_content)
if not isinstance(patterns, list):
raise ValueError("JSON must contain a list of patterns")
# Unescape if needed
patterns = [pattern.replace('\\\\', '\\') for pattern in patterns]
return patterns
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format: {e}")

View File

@@ -12,7 +12,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse
import os
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion)
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
from ..models.user import Tenant
@@ -219,12 +219,6 @@ def start_embedding_task(tenant_id, doc_vers_id):
return task.id
def validate_file_type(extension):
if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
def get_filename_from_url(url):
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
@@ -363,3 +357,109 @@ def cope_with_local_url(url):
return url
def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -> tuple[Document, DocumentVersion]:
"""
Look up a document using metadata criteria
Args:
tenant_id: ID of the tenant
lookup_criteria: Dictionary of key-value pairs to match in metadata
metadata_type: Which metadata to search in ('user_metadata' or 'system_metadata')
Returns:
Tuple of (Document, DocumentVersion) if found
Raises:
ValueError: If invalid metadata_type provided
EveAIException: If lookup fails
"""
if metadata_type not in ['user_metadata', 'system_metadata']:
raise ValueError(f"Invalid metadata_type: {metadata_type}")
try:
# Query for the latest document version matching the criteria
query = (db.session.query(Document, DocumentVersion)
.join(DocumentVersion)
.filter(Document.id == DocumentVersion.doc_id)
.order_by(DocumentVersion.id.desc()))
# Add metadata filtering using PostgreSQL JSONB operators
metadata_field = getattr(DocumentVersion, metadata_type)
for key, value in lookup_criteria.items():
query = query.filter(metadata_field[key].astext == str(value))
# Get first result
result = query.first()
if not result:
raise EveAIException(
f"No document found matching criteria in {metadata_type}",
status_code=404
)
return result
except SQLAlchemyError as e:
current_app.logger.error(f'Database error during document lookup for tenant {tenant_id}: {e}')
raise EveAIException(
"Database error during document lookup",
status_code=500
)
except Exception as e:
current_app.logger.error(f'Error during document lookup for tenant {tenant_id}: {e}')
raise EveAIException(
"Error during document lookup",
status_code=500
)
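
A hedged usage sketch for lookup_document (tenant id and criteria are placeholders):

from common.utils.document_utils import lookup_document

document, version = lookup_document(
    tenant_id=7,
    lookup_criteria={"external_id": "123", "source": "zapier"},
    metadata_type="user_metadata",
)
# document is the matching Document; version is its latest DocumentVersion whose
# user_metadata contains external_id == "123" and source == "zapier"
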
# Add to common/utils/document_utils.py
def refresh_document_with_content(doc_id: int, tenant_id: int, file_content: bytes, api_input: dict) -> tuple:
"""
Refresh document with new content
Args:
doc_id: Document ID
tenant_id: Tenant ID
file_content: New file content
api_input: Additional document information
Returns:
Tuple of (new_version, task_id)
"""
doc = Document.query.get(doc_id)
if not doc:
raise EveAIInvalidDocument(tenant_id, doc_id)
old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()
# Create new version with same file type as original
extension = old_doc_vers.file_type
new_doc_vers = create_version_for_document(
doc, tenant_id,
'', # No URL for content-based updates
old_doc_vers.sub_file_type,
api_input.get('language', old_doc_vers.language),
api_input.get('user_context', old_doc_vers.user_context),
api_input.get('user_metadata', old_doc_vers.user_metadata),
api_input.get('catalog_properties', old_doc_vers.catalog_properties),
)
try:
db.session.add(new_doc_vers)
db.session.commit()
except SQLAlchemyError as e:
db.session.rollback()
return None, str(e)
# Upload new content
upload_file_for_version(new_doc_vers, file_content, extension, tenant_id)
# Start embedding task
task = current_celery.send_task('create_embeddings', args=[tenant_id, new_doc_vers.id], queue='embeddings')
current_app.logger.info(f'Embedding creation started for document {doc_id} on version {new_doc_vers.id} '
f'with task id: {task.id}.')
return new_doc_vers, task.id
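
A hedged call sketch for refresh_document_with_content (ids and payload are placeholders; note that on a database error it returns (None, error_message) instead of raising):

new_version, task_id = refresh_document_with_content(
    doc_id=42,
    tenant_id=7,
    file_content=b"# Updated report\n...",
    api_input={"language": "en", "user_metadata": {"source": "zapier"}},
)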

View File

@@ -55,7 +55,6 @@ class Config(object):
# file upload settings
MAX_CONTENT_LENGTH = 50 * 1024 * 1024
UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']
# supported languages
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
@@ -143,10 +142,7 @@ class Config(object):
LANGCHAIN_ENDPOINT = 'https://api.smith.langchain.com'
LANGCHAIN_PROJECT = "eveai"
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']
SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test', 'Wordpress Starter']
# The maximum number of seconds allowed for audio compression (to save resources)
MAX_COMPRESSION_DURATION = 60*10  # 10 minutes

View File

@@ -5,6 +5,19 @@ PROCESSOR_TYPES = {
"file_types": "html", "file_types": "html",
"Description": "A processor for HTML files", "Description": "A processor for HTML files",
"configuration": { "configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"html_tags": { "html_tags": {
"name": "HTML Tags", "name": "HTML Tags",
"type": "string", "type": "string",
@@ -45,7 +58,21 @@ PROCESSOR_TYPES = {
"name": "PDF Processor", "name": "PDF Processor",
"file_types": "pdf", "file_types": "pdf",
"Description": "A Processor for PDF files", "Description": "A Processor for PDF files",
"configuration": {} "configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
},
},
"AUDIO_PROCESSOR": {
"name": "AUDIO Processor",
@@ -53,4 +80,89 @@ PROCESSOR_TYPES = {
"Description": "A Processor for audio files", "Description": "A Processor for audio files",
"configuration": {} "configuration": {}
}, },
"MARKDOWN_PROCESSOR": {
"name": "Markdown Processor",
"file_types": "md",
"Description": "A Processor for markdown files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
}
},
"DOCX_PROCESSOR": {
"name": "DOCX Processor",
"file_types": "docx",
"Description": "A processor for DOCX files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"extract_comments": {
"name": "Extract Comments",
"type": "boolean",
"description": "Whether to include document comments in the markdown",
"required": False,
"default": False
},
"extract_headers_footers": {
"name": "Extract Headers/Footers",
"type": "boolean",
"description": "Whether to include headers and footers in the markdown",
"required": False,
"default": False
},
"preserve_formatting": {
"name": "Preserve Formatting",
"type": "boolean",
"description": "Whether to preserve bold, italic, and other text formatting",
"required": False,
"default": True
},
"list_style": {
"name": "List Style",
"type": "enum",
"description": "How to format lists in markdown",
"required": False,
"default": "dash",
"allowed_values": ["dash", "asterisk", "plus"]
},
"image_handling": {
"name": "Image Handling",
"type": "enum",
"description": "How to handle embedded images",
"required": False,
"default": "skip",
"allowed_values": ["skip", "extract", "placeholder"]
},
"table_alignment": {
"name": "Table Alignment",
"type": "enum",
"description": "How to align table contents",
"required": False,
"default": "left",
"allowed_values": ["left", "center", "preserve"]
}
}
}
}
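
Illustrative example (not taken from the commit): a stored Processor.configuration for a MARKDOWN_PROCESSOR combining the two new chunking parameters; chunking_patterns is kept as the JSON string produced by patterns_to_json:

configuration = {
    "chunking_heading_level": 3,  # split on #, ## and ###
    "chunking_patterns": '["^Chapter \\\\d+", "^Appendix"]',
}
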

View File

@@ -11,9 +11,10 @@ from common.utils.document_utils import (
create_document_stack, process_url, start_embedding_task,
validate_file_type, EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
get_documents_list, edit_document, refresh_document, edit_document_version,
refresh_document_with_info
refresh_document_with_info, lookup_document
)
from common.utils.eveai_exceptions import EveAIException
from eveai_api.api.auth import requires_service
def validate_date(date_str):
@@ -59,6 +60,7 @@ add_document_response = document_ns.model('AddDocumentResponse', {
@document_ns.route('/add_document')
class AddDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(upload_parser)
@document_ns.response(201, 'Document added successfully', add_document_response)
@document_ns.response(400, 'Validation Error')
@@ -134,6 +136,7 @@ add_url_response = document_ns.model('AddURLResponse', {
@document_ns.route('/add_url')
class AddURL(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(add_url_model)
@document_ns.response(201, 'Document added successfully', add_url_response)
@document_ns.response(400, 'Validation Error')
@@ -190,6 +193,7 @@ document_list_model = document_ns.model('DocumentList', {
@document_ns.route('/list')
class DocumentList(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('list_documents')
@document_ns.marshal_list_with(document_list_model, envelope='documents')
def get(self):
@@ -210,6 +214,7 @@ edit_document_model = document_ns.model('EditDocument', {
@document_ns.route('/<int:document_id>')
class DocumentResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document')
@document_ns.expect(edit_document_model)
@document_ns.response(200, 'Document updated successfully')
@@ -232,6 +237,7 @@ class DocumentResource(Resource):
return e.to_dict(), e.status_code
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('refresh_document')
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
@@ -253,6 +259,7 @@ edit_document_version_model = document_ns.model('EditDocumentVersion', {
@document_ns.route('/version/<int:version_id>')
class DocumentVersionResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document_version')
@document_ns.expect(edit_document_version_model)
@document_ns.response(200, 'Document version updated successfully')
@@ -280,6 +287,7 @@ refresh_document_model = document_ns.model('RefreshDocument', {
@document_ns.route('/<int:document_id>/refresh')
class RefreshDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(404, 'Document not found')
def post(self, document_id):
@@ -310,6 +318,7 @@ class RefreshDocument(Resource):
@document_ns.route('/<int:document_id>/refresh_with_info')
class RefreshDocumentWithInfo(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_document_model)
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(400, 'Validation Error')
@@ -338,3 +347,112 @@ class RefreshDocumentWithInfo(Resource):
except Exception as e:
current_app.logger.error(f'Error refreshing document with info: {str(e)}')
return {'message': 'Internal server error'}, 500
# Define models for lookup requests
lookup_model = document_ns.model('DocumentLookup', {
'lookup_criteria': fields.Raw(required=True,
description='JSON object containing key-value pairs to match in metadata. '
'Example: {"external_id": "123", "source": "zapier", "source_type": "google_docs"}'),
'metadata_type': fields.String(required=True, enum=['user_metadata', 'system_metadata'],
description='Which metadata field to search in')
})
lookup_response = document_ns.model('DocumentLookupResponse', {
'document_id': fields.Integer(description='ID of the found document'),
'document_version_id': fields.Integer(description='ID of the latest document version'),
'name': fields.String(description='Document name'),
'metadata': fields.Raw(description='Full metadata of the found document')
})
@document_ns.route('/lookup')
class DocumentLookup(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(lookup_model)
@document_ns.marshal_with(lookup_response)
@document_ns.response(200, 'Document found', lookup_response)
@document_ns.response(404, 'No document found matching criteria')
def post(self):
"""
Look up a document using metadata criteria
"""
tenant_id = get_jwt_identity()
try:
data = request.json
document, version = lookup_document(
tenant_id,
data['lookup_criteria'],
data['metadata_type']
)
return {
'document_id': document.id,
'document_version_id': version.id,
'name': document.name,
'metadata': getattr(version, data['metadata_type'])
}
except EveAIException as e:
return e.to_dict(), e.status_code
except KeyError as e:
return {'message': f'Missing required field: {str(e)}'}, 400
refresh_content_model = document_ns.model('RefreshDocumentContent', {
'file_content': fields.Raw(required=True, description='The new file content'),
'language': fields.String(required=False, description='Language of the document'),
'user_context': fields.String(required=False, description='User context for the document'),
'user_metadata': fields.Raw(required=False, description='Custom metadata fields'),
'catalog_properties': fields.Raw(required=False, description='Catalog-specific properties'),
'trigger_service': fields.String(required=False, description='Service that triggered the update')
})
@document_ns.route('/<int:document_id>/refresh_content')
class RefreshDocumentContent(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_content_model)
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
"""Refresh a document with new content"""
tenant_id = get_jwt_identity()
try:
data = request.json
file_content = data['file_content']
# Build user_metadata by merging:
# 1. Existing metadata (if any)
# 2. New metadata from request
# 3. Zapier-specific fields
user_metadata = data.get('user_metadata', {})
user_metadata.update({
'source': 'zapier',
'trigger_service': data.get('trigger_service')
})
data['user_metadata'] = user_metadata
# Keep catalog_properties separate
if 'catalog_properties' in data:
# We could add validation here against catalog configuration
data['catalog_properties'] = data['catalog_properties']
new_version, task_id = refresh_document_with_content(
document_id,
tenant_id,
file_content,
data
)
return {
'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {task_id}',
'document_id': document_id,
'document_version_id': new_version.id,
'task_id': task_id
}, 200
except EveAIException as e:
return e.to_dict(), e.status_code
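
A hedged client sketch for the new lookup endpoint (base URL, route prefix and token are placeholders; the exact path depends on how the document namespace is mounted):

import requests

resp = requests.post(
    "https://eveai.example.com/api/document/lookup",
    headers={"Authorization": "Bearer <jwt>"},
    json={
        "lookup_criteria": {"external_id": "123", "source": "zapier", "source_type": "google_docs"},
        "metadata_type": "user_metadata",
    },
)
print(resp.status_code, resp.json())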

View File

@@ -15,14 +15,6 @@ from config.type_defs.retriever_types import RETRIEVER_TYPES
from .dynamic_form_base import DynamicFormBase
def allowed_file(form, field):
if field.data:
filename = field.data.filename
allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
raise ValidationError('Unsupported file type.')
def validate_json(form, field):
if field.data:
try:
@@ -101,7 +93,10 @@ class ProcessorForm(FlaskForm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Dynamically populate the 'type' field using the constructor
self.type.choices = [(key, value['name']) for key, value in PROCESSOR_TYPES.items()]
self.type.choices = sorted(
[(key, value['name']) for key, value in PROCESSOR_TYPES.items()],
key=lambda x: x[1],
)
class EditProcessorForm(DynamicFormBase):
@@ -177,7 +172,7 @@ class EditRetrieverForm(DynamicFormBase):
class AddDocumentForm(DynamicFormBase):
file = FileField('File', validators=[FileRequired(), allowed_file])
file = FileField('File', validators=[FileRequired()])
catalog = StringField('Catalog', render_kw={'readonly': True})
sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
name = StringField('Name', validators=[Length(max=100)])

View File

@@ -14,7 +14,7 @@ import json
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
from common.extensions import db
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
@@ -391,9 +391,6 @@ def add_document():
sub_file_type = form.sub_file_type.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
validate_file_type(extension)
catalog_properties = {}
document_version_configurations = CATALOG_TYPES[catalog.type]['document_version_configurations']
for config in document_version_configurations:

View File

@@ -5,7 +5,46 @@ import json
from wtforms.fields.choices import SelectField
from wtforms.fields.datetime import DateField
from common.utils.config_field_types import TaggingFields
from common.utils.config_field_types import TaggingFields, json_to_patterns, patterns_to_json
class TaggingFieldsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return json.dumps(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# try:
# self.data = json.loads(valuelist[0])
# except json.JSONDecodeError as e:
# raise ValueError('Not valid JSON content')
class ChunkingPatternsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return '\n'.join(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# self.data = [line.strip() for line in valuelist[0].split('\n') if line.strip()]
class DynamicFormBase(FlaskForm):
@@ -80,7 +119,7 @@ class DynamicFormBase(FlaskForm):
# Handle special case for tagging_fields
if field_type == 'tagging_fields':
field_class = TextAreaField
field_class = TaggingFieldsField
extra_classes = 'json-editor'
field_kwargs = {}
elif field_type == 'enum':
@@ -89,6 +128,10 @@ class DynamicFormBase(FlaskForm):
choices = [(str(val), str(val)) for val in allowed_values]
extra_classes = ''
field_kwargs = {'choices': choices}
elif field_type == 'chunking_patterns':
field_class = ChunkingPatternsField
extra_classes = ['monospace-text', 'pattern-input']
field_kwargs = {}
else:
extra_classes = ''
field_class = {
@@ -111,6 +154,12 @@ class DynamicFormBase(FlaskForm):
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to JSON: {e}")
field_data = "{}"
elif field_type == 'chunking_patterns':
try:
field_data = json_to_patterns(field_data)
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to a list of patterns: {e}")
field_data = {}
elif default is not None:
field_data = default
@@ -173,12 +222,17 @@ class DynamicFormBase(FlaskForm):
original_field_name = full_field_name[prefix_length:]
field = getattr(self, full_field_name)
# Parse JSON for tagging_fields type
if isinstance(field, TextAreaField) and field.data:
if isinstance(field, TaggingFieldsField) and field.data:
try:
data[original_field_name] = json.loads(field.data)
except json.JSONDecodeError:
# Validation should catch this, but just in case
data[original_field_name] = field.data
elif isinstance(field, ChunkingPatternsField):
try:
data[original_field_name] = patterns_to_json(field.data)
except Exception as e:
current_app.logger.error(f"Error converting initial data to patterns: {e}")
else:
data[original_field_name] = field.data
return data
@@ -230,5 +284,3 @@ def validate_tagging_fields(form, field):
except (TypeError, ValueError) as e:
raise ValidationError(f"Invalid field definition: {str(e)}")

View File

@@ -46,7 +46,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
try:
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
total_duration = len(audio_info)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
@@ -55,7 +55,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
@@ -87,7 +87,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
@@ -172,14 +172,14 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
self.log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
self.log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -202,7 +202,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
self.log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription

View File

@@ -17,7 +17,7 @@ class BaseProcessor(ABC):
self.tuning_logger = None
self._setup_tuning_logger()
self._log_tuning("Processor initialized", {
self.log_tuning("Processor initialized", {
"processor_type": processor.type if processor else None,
"document_version": document_version.id if document_version else None,
"catalog": catalog.id if catalog else None
@@ -42,6 +42,10 @@ class BaseProcessor(ABC):
def process(self):
pass
@property
def configuration(self):
return self.processor.configuration
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
@@ -78,7 +82,7 @@ class BaseProcessor(ABC):
return markdown
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
def log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
if self.tuning and self.tuning_logger:
try:
self.tuning_logger.log_tuning('processor', message, data)
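
With _log_tuning promoted to the public log_tuning and the new configuration property, callers outside the processor can read its settings and write tuning output directly; the embedding task further below does roughly:

heading_level = processor.configuration.get('chunking_heading_level', 2)
processor.log_tuning("Processor returned: ", {"markdown": markdown, "title": title})
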

View File

@@ -0,0 +1,129 @@
import docx
import io
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry
from common.extensions import minio_client
import re
class DocxProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.config = processor.configuration
self.extract_comments = self.config.get('extract_comments', False)
self.extract_headers_footers = self.config.get('extract_headers_footers', False)
self.preserve_formatting = self.config.get('preserve_formatting', True)
self.list_style = self.config.get('list_style', 'dash')
self.image_handling = self.config.get('image_handling', 'skip')
self.table_alignment = self.config.get('table_alignment', 'left')
def process(self):
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
doc = docx.Document(io.BytesIO(file_data))
markdown = self._convert_to_markdown(doc)
title = self._extract_title(doc)
self._save_markdown(markdown)
return markdown, title
except Exception as e:
self._log(f"Error processing DOCX: {str(e)}", level='error')
raise
def _convert_to_markdown(self, doc):
markdown_parts = []
if self.extract_headers_footers:
for section in doc.sections:
if section.header.paragraphs:
markdown_parts.extend(self._process_paragraphs(section.header.paragraphs))
markdown_parts.extend(self._process_paragraphs(doc.paragraphs))
if self.extract_comments and doc.comments:
markdown_parts.append("\n## Comments\n")
for comment in doc.comments:
markdown_parts.append(f"> {comment.text}\n")
return "\n".join(markdown_parts)
def _process_paragraphs(self, paragraphs):
markdown_parts = []
in_list = False
for para in paragraphs:
if not para.text.strip():
continue
style = para.style.name.lower()
if 'heading' in style:
level = int(style[-1]) if style[-1].isdigit() else 1
markdown_parts.append(f"{'#' * level} {para.text}\n")
elif para._p.pPr and para._p.pPr.numPr: # List item
marker = self._get_list_marker()
markdown_parts.append(f"{marker} {para.text}\n")
in_list = True
else:
if in_list:
markdown_parts.append("\n")
in_list = False
text = para.text
if self.preserve_formatting:
text = self._apply_formatting(para)
markdown_parts.append(f"{text}\n")
return markdown_parts
def _get_list_marker(self):
return {
'dash': '-',
'asterisk': '*',
'plus': '+'
}.get(self.list_style, '-')
def _apply_formatting(self, paragraph):
text = paragraph.text
if not text:
return ""
runs = paragraph.runs
formatted_parts = []
for run in runs:
part = run.text
if run.bold:
part = f"**{part}**"
if run.italic:
part = f"*{part}*"
if run.underline:
part = f"__{part}__"
formatted_parts.append(part)
return "".join(formatted_parts)
def _extract_title(self, doc):
if doc.paragraphs:
first_para = doc.paragraphs[0]
if 'heading' in first_para.style.name.lower():
return first_para.text.strip()
# Look for first Heading 1 in document
for para in doc.paragraphs:
if para.style.name.lower() == 'heading 1':
return para.text.strip()
return "Untitled Document"
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)

View File

@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
self.log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
input_html = {"html": chunk} input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html) markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk) markdown_chunks.append(markdown_chunk)
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk}) self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks) markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}') self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')

View File

@@ -0,0 +1,48 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
def _find_first_h1(markdown: str) -> str:
# Look for # Header (allowing spaces after #)
match = re.search(r'^#\s+(.+)$', markdown, re.MULTILINE)
return match.group(1).strip() if match else ""
class MarkdownProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
def process(self):
self._log("Starting Markdown processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
markdown = file_data.decode('utf-8')
title = _find_first_h1(markdown)
self._save_markdown(markdown)
self._log("Finished processing Markdown")
return markdown, title
except Exception as e:
self._log(f"Error processing Markdown: {str(e)}", level='error')
raise
ProcessorRegistry.register("MARKDOWN_PROCESSOR", MarkdownProcessor)

View File

@@ -57,7 +57,7 @@ class PDFProcessor(BaseProcessor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
@@ -119,7 +119,7 @@ class PDFProcessor(BaseProcessor):
markdown_table = self._table_to_markdown(table)
if markdown_table:  # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables

View File

@@ -45,7 +45,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
self.log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
@@ -64,7 +64,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
self.log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,

View File

@@ -1,3 +1,4 @@
import re
from datetime import datetime as dt, timezone as tz
from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
from common.utils.config_field_types import json_to_pattern_list
# Healthcheck task
@current_celery.task(name='ping', queue='embeddings')
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
processor=processor
)
markdown, title = document_processor.process()
document_processor.log_tuning("Processor returned: ", {
'markdown': markdown,
'title': title
})
with current_event.create_span("Embedding"): with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title) embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)
current_event.log("Finished Embedding Creation Task") current_event.log("Finished Embedding Creation Task")
@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
raise
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
processor.log_tuning("Chunks: ", {'chunks': chunks})
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})
# Create embeddings
with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
return new_embeddings
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
heading_level = processor.configuration.get('chunking_heading_level', 2)
# Download the markdown file from MinIO
markdown_data = minio_client.download_document_file(tenant_id,
document_version.bucket_name,
markdown_on,
)
markdown = markdown_data.decode('utf-8')
headers_to_split_on = [
("#", "Header 1"),
(f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
("##", "Header 2"),
]
processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
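
For example, with chunking_heading_level = 3 the comprehension above produces:

heading_level = 3
headers_to_split_on = [(f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))]
# -> [('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3')]
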
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
raise
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
actual_chunks = []
current_chunk = ""
current_length = 0
def matches_chunking_pattern(text, patterns):
if not patterns:
return False
# Get the first line of the text
first_line = text.split('\n', 1)[0].strip()
# Check if it's a header at appropriate level
header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
if not header_match:
return False
# Get the heading level (number of #s)
header_level = len(header_match.group(1))
# Get the header text
header_text = header_match.group(2)
# Check if header matches any pattern
for pattern in patterns:
try:
processor.log_tuning('Pattern check: ', {
'pattern: ': pattern,
'text': header_text
})
if re.search(pattern, header_text, re.IGNORECASE):
return True
except Exception as e:
current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
continue
return False
chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
processor.log_tuning(f'Chunking Patterns Extraction: ', {
'Full Configuration': processor.configuration,
'Chunking Patterns': chunking_patterns,
})
for chunk in potential_chunks:
chunk_length = len(chunk)
# Force new chunk if pattern matches
if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
if current_chunk and current_length >= min_chars:
actual_chunks.append(current_chunk)
current_chunk = chunk
current_length = chunk_length
continue
if current_length + chunk_length > max_chars:
if current_length >= min_chars:
actual_chunks.append(current_chunk)
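
A self-contained sketch of the header-matching rule used above (the tuning log call and the per-pattern error guard are omitted):

import re

def matches_chunking_pattern(text, patterns):
    first_line = text.split('\n', 1)[0].strip()
    header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
    if not header_match:
        return False
    header_text = header_match.group(2)
    return any(re.search(p, header_text, re.IGNORECASE) for p in patterns)

matches_chunking_pattern("## Chapter 4: Results\nBody text", [r"^Chapter \d+"])  # True
matches_chunking_pattern("Plain paragraph", [r"^Chapter \d+"])                   # False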

View File

@@ -89,3 +89,4 @@ prometheus_flask_exporter~=0.23.1
prometheus_client~=0.20.0
babel~=2.16.0
dogpile.cache~=1.3.3
python-docx~=1.1.2