- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions
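
The two new parameters are read from the processor configuration in the chunking code below. As a quick orientation, here is a minimal sketch of a configuration carrying both keys; the surrounding schema is an assumption, but the key names and the default heading level of 2 come from this diff:

```python
# Hypothetical configuration dict for a markdown/docx processor.
# Only the two new keys are taken from this commit; everything else
# about the schema is assumed for illustration.
configuration = {
    'chunking_heading_level': 3,   # split on '#', '##' and '###' headings
    'chunking_patterns': [         # headings matching these force a new chunk
        r'^Chapter \d+',
        r'^Appendix [A-Z]',
    ],
}
```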


@@ -1,3 +1,4 @@
+import re
 from datetime import datetime as dt, timezone as tz
 from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
 from config.type_defs.processor_types import PROCESSOR_TYPES
 from eveai_workers.processors.processor_registry import ProcessorRegistry
+from common.utils.config_field_types import json_to_pattern_list

 # Healthcheck task
 @current_celery.task(name='ping', queue='embeddings')
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
         processor=processor
     )
     markdown, title = document_processor.process()
+    document_processor.log_tuning("Processor returned: ", {
+        'markdown': markdown,
+        'title': title
+    })
     with current_event.create_span("Embedding"):
-        embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
+        embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)
     current_event.log("Finished Embedding Creation Task")
@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
         raise

-def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
+def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
     # Create potential chunks
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
+    processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})
     # Combine chunks for embedding
-    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
+    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
+    processor.log_tuning("Chunks: ", {'chunks': chunks})
     # Enrich chunks
     with current_event.create_span("Enrich Chunks"):
         enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
+    processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})
     # Create embeddings
     with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     return new_embeddings

-def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
+def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
     try:
         current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
-        markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
-        # Download the markdown file from MinIO
-        markdown_data = minio_client.download_document_file(tenant_id,
-                                                            document_version.bucket_name,
-                                                            markdown_on,
-                                                            )
-        markdown = markdown_data.decode('utf-8')
+        heading_level = processor.configuration.get('chunking_heading_level', 2)
         headers_to_split_on = [
-            ("#", "Header 1"),
-            ("##", "Header 2"),
+            (f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
         ]
+        processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
         md_header_splits = markdown_splitter.split_text(markdown)
         potential_chunks = [doc.page_content for doc in md_header_splits]
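
As a standalone illustration of the new splitting logic, here is a sketch of what the comprehension builds and how LangChain's MarkdownHeaderTextSplitter consumes it, assuming chunking_heading_level is 3 and the langchain_text_splitters package layout (older releases expose the same class from langchain.text_splitter):

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

heading_level = 3  # processor.configuration.get('chunking_heading_level', 2)
headers_to_split_on = [
    ('#' * i, f"Header {i}") for i in range(1, min(heading_level + 1, 7))
]
# -> [('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3')]

markdown = "# Title\nIntro\n## Section\nBody\n### Detail\nMore detail"
splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
potential_chunks = [doc.page_content for doc in splitter.split_text(markdown)]
# Every heading up to level 3 starts a new potential chunk; deeper headings
# ('####' and below) stay inside the chunk of their parent section.
```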
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
         raise

-def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
+def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
     actual_chunks = []
     current_chunk = ""
     current_length = 0
+
+    def matches_chunking_pattern(text, patterns):
+        if not patterns:
+            return False
+        # Get the first line of the text
+        first_line = text.split('\n', 1)[0].strip()
+        # Check if it's a header at appropriate level
+        header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
+        if not header_match:
+            return False
+        # Get the heading level (number of #s)
+        header_level = len(header_match.group(1))
+        # Get the header text
+        header_text = header_match.group(2)
+        # Check if header matches any pattern
+        for pattern in patterns:
+            try:
+                processor.log_tuning('Pattern check: ', {
+                    'pattern: ': pattern,
+                    'text': header_text
+                })
+                if re.search(pattern, header_text, re.IGNORECASE):
+                    return True
+            except Exception as e:
+                current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
+                continue
+        return False
+    chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
+    processor.log_tuning(f'Chunking Patterns Extraction: ', {
+        'Full Configuration': processor.configuration,
+        'Chunking Patterns': chunking_patterns,
+    })
+
     for chunk in potential_chunks:
         chunk_length = len(chunk)
+        # Force new chunk if pattern matches
+        if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
+            if current_chunk and current_length >= min_chars:
+                actual_chunks.append(current_chunk)
+            current_chunk = chunk
+            current_length = chunk_length
+            continue
+
         if current_length + chunk_length > max_chars:
             if current_length >= min_chars:
                 actual_chunks.append(current_chunk)
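
Taken together, the combiner is a greedy accumulator with pattern-forced breaks. The hunk is cut off above, so the tail of the loop is not shown; the following self-contained sketch mirrors the visible logic, with the log_tuning calls, the invalid-pattern handling, and the unseen concatenation and final-flush steps replaced by stated assumptions:

```python
import re

def first_line_matches(text, patterns):
    # Mirrors matches_chunking_pattern: only a markdown heading on the first
    # line can force a break, and only if one of the regexes matches its text.
    m = re.match(r'^(#{1,6})\s+(.+)$', text.split('\n', 1)[0].strip())
    return bool(m) and any(re.search(p, m.group(2), re.IGNORECASE) for p in patterns)

def combine(potential_chunks, min_chars, max_chars, patterns):
    actual_chunks, current_chunk = [], ""
    for chunk in potential_chunks:
        # Pattern-matched headings always start a fresh chunk (as in the diff).
        if patterns and first_line_matches(chunk, patterns):
            if current_chunk and len(current_chunk) >= min_chars:
                actual_chunks.append(current_chunk)
            current_chunk = chunk
            continue
        # The oversize flush below follows the diff; direct concatenation and
        # the trailing flush are assumptions, since that part is cut off above.
        if len(current_chunk) + len(chunk) > max_chars:
            if len(current_chunk) >= min_chars:
                actual_chunks.append(current_chunk)
            current_chunk = chunk
        else:
            current_chunk += chunk
    if current_chunk:
        actual_chunks.append(current_chunk)
    return actual_chunks

chunks = combine(
    ["## Chapter 1\nIntro text", "Some body text", "## Chapter 2\nMore text"],
    min_chars=10, max_chars=2000, patterns=[r'^Chapter \d+'],
)
# Yields two chunks: the 'Chapter 2' heading breaks the chunk even though
# the accumulated text is nowhere near max_chars.
```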