- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions
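
The two new parameters are read from the processor configuration in the chunking code below. As a quick orientation, here is a minimal sketch of a configuration carrying both keys; the surrounding schema is an assumption, but the key names and the default heading level of 2 come from this diff:

```python
# Hypothetical configuration dict for a markdown/docx processor.
# Only the two new keys are taken from this commit; everything else
# about the schema is assumed for illustration.
configuration = {
    'chunking_heading_level': 3,   # split on '#', '##' and '###' headings
    'chunking_patterns': [         # headings matching these force a new chunk
        r'^Chapter \d+',
        r'^Appendix [A-Z]',
    ],
}
```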


@@ -1,3 +1,4 @@
+import re
 from datetime import datetime as dt, timezone as tz
 from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
 from config.type_defs.processor_types import PROCESSOR_TYPES
 from eveai_workers.processors.processor_registry import ProcessorRegistry
+from common.utils.config_field_types import json_to_pattern_list

 # Healthcheck task
 @current_celery.task(name='ping', queue='embeddings')
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
         processor=processor
     )
     markdown, title = document_processor.process()
+    document_processor.log_tuning("Processor returned: ", {
+        'markdown': markdown,
+        'title': title
+    })
     with current_event.create_span("Embedding"):
-        embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
+        embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)
     current_event.log("Finished Embedding Creation Task")
@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
         raise

-def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
+def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
     # Create potential chunks
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
+    processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})
     # Combine chunks for embedding
-    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
+    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
+    processor.log_tuning("Chunks: ", {'chunks': chunks})
     # Enrich chunks
     with current_event.create_span("Enrich Chunks"):
         enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
+    processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})
     # Create embeddings
     with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     return new_embeddings

-def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
+def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
     try:
         current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
-        markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
-        # Download the markdown file from MinIO
-        markdown_data = minio_client.download_document_file(tenant_id,
-                                                            document_version.bucket_name,
-                                                            markdown_on,
-                                                            )
-        markdown = markdown_data.decode('utf-8')
+        heading_level = processor.configuration.get('chunking_heading_level', 2)
         headers_to_split_on = [
-            ("#", "Header 1"),
-            ("##", "Header 2"),
+            (f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
         ]
+        processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
         md_header_splits = markdown_splitter.split_text(markdown)
         potential_chunks = [doc.page_content for doc in md_header_splits]
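
As a standalone illustration of the new splitting logic, here is a sketch of what the comprehension builds and how LangChain's MarkdownHeaderTextSplitter consumes it, assuming chunking_heading_level is 3 and the langchain_text_splitters package layout (older releases expose the same class from langchain.text_splitter):

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

heading_level = 3  # processor.configuration.get('chunking_heading_level', 2)
headers_to_split_on = [
    ('#' * i, f"Header {i}") for i in range(1, min(heading_level + 1, 7))
]
# -> [('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3')]

markdown = "# Title\nIntro\n## Section\nBody\n### Detail\nMore detail"
splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
potential_chunks = [doc.page_content for doc in splitter.split_text(markdown)]
# Every heading up to level 3 starts a new potential chunk; deeper headings
# ('####' and below) stay inside the chunk of their parent section.
```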
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
         raise

-def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
+def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
     actual_chunks = []
     current_chunk = ""
     current_length = 0
+
+    def matches_chunking_pattern(text, patterns):
+        if not patterns:
+            return False
+        # Get the first line of the text
+        first_line = text.split('\n', 1)[0].strip()
+        # Check if it's a header at appropriate level
+        header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
+        if not header_match:
+            return False
+        # Get the heading level (number of #s)
+        header_level = len(header_match.group(1))
+        # Get the header text
+        header_text = header_match.group(2)
+        # Check if header matches any pattern
+        for pattern in patterns:
+            try:
+                processor.log_tuning('Pattern check: ', {
+                    'pattern: ': pattern,
+                    'text': header_text
+                })
+                if re.search(pattern, header_text, re.IGNORECASE):
+                    return True
+            except Exception as e:
+                current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
+                continue
+        return False
+    chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
+    processor.log_tuning(f'Chunking Patterns Extraction: ', {
+        'Full Configuration': processor.configuration,
+        'Chunking Patterns': chunking_patterns,
+    })
+
     for chunk in potential_chunks:
         chunk_length = len(chunk)
+        # Force new chunk if pattern matches
+        if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
+            if current_chunk and current_length >= min_chars:
+                actual_chunks.append(current_chunk)
+            current_chunk = chunk
+            current_length = chunk_length
+            continue
+
         if current_length + chunk_length > max_chars:
             if current_length >= min_chars:
                 actual_chunks.append(current_chunk)
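
Taken together, the combiner is a greedy accumulator with pattern-forced breaks. The hunk is cut off above, so the tail of the loop is not shown; the following self-contained sketch mirrors the visible logic, with the log_tuning calls, the invalid-pattern handling, and the unseen concatenation and final-flush steps replaced by stated assumptions:

```python
import re

def first_line_matches(text, patterns):
    # Mirrors matches_chunking_pattern: only a markdown heading on the first
    # line can force a break, and only if one of the regexes matches its text.
    m = re.match(r'^(#{1,6})\s+(.+)$', text.split('\n', 1)[0].strip())
    return bool(m) and any(re.search(p, m.group(2), re.IGNORECASE) for p in patterns)

def combine(potential_chunks, min_chars, max_chars, patterns):
    actual_chunks, current_chunk = [], ""
    for chunk in potential_chunks:
        # Pattern-matched headings always start a fresh chunk (as in the diff).
        if patterns and first_line_matches(chunk, patterns):
            if current_chunk and len(current_chunk) >= min_chars:
                actual_chunks.append(current_chunk)
            current_chunk = chunk
            continue
        # The oversize flush below follows the diff; direct concatenation and
        # the trailing flush are assumptions, since that part is cut off above.
        if len(current_chunk) + len(chunk) > max_chars:
            if len(current_chunk) >= min_chars:
                actual_chunks.append(current_chunk)
            current_chunk = chunk
        else:
            current_chunk += chunk
    if current_chunk:
        actual_chunks.append(current_chunk)
    return actual_chunks

chunks = combine(
    ["## Chapter 1\nIntro text", "Some body text", "## Chapter 2\nMore text"],
    min_chars=10, max_chars=2000, patterns=[r'^Chapter \d+'],
)
# Yields two chunks: the 'Chapter 2' heading breaks the chunk even though
# the accumulated text is nowhere near max_chars.
```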