- Addition of general chunking parameters chunking_heading_level and chunking_patterns
- Addition of processor types docx and markdown
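Both parameters are read from the processor configuration at chunking time. A hypothetical configuration carrying them might look as follows (the key names come from this commit; the concrete values, and whether chunking_patterns is stored as a plain list or a JSON string, are illustrative assumptions):

    # Hypothetical processor configuration (illustrative values only)
    configuration = {
        'chunking_heading_level': 3,  # split on '#' through '###' headers
        'chunking_patterns': ['^chapter [0-9]+', '^appendix'],  # regexes matched against header text
    }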
@@ -1,3 +1,4 @@
+import re
 from datetime import datetime as dt, timezone as tz

 from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
 from config.type_defs.processor_types import PROCESSOR_TYPES
+from eveai_workers.processors.processor_registry import ProcessorRegistry

+from common.utils.config_field_types import json_to_pattern_list


 # Healthcheck task
 @current_celery.task(name='ping', queue='embeddings')
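json_to_pattern_list is a project helper whose implementation lies outside this diff. A minimal sketch of the behaviour the calling code appears to rely on, assuming it accepts either a JSON-encoded string or an already-parsed list and returns regex pattern strings (not the project's actual implementation):

    import json

    def json_to_pattern_list(value):
        # Sketch only: normalise a JSON-encoded list (or a plain list)
        # of regex strings into a list of pattern strings.
        if isinstance(value, str):
            value = json.loads(value)
        return [str(p) for p in (value or [])]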
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
             processor=processor
         )
         markdown, title = document_processor.process()
+        document_processor.log_tuning("Processor returned: ", {
+            'markdown': markdown,
+            'title': title
+        })

     with current_event.create_span("Embedding"):
-        embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
+        embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)

     current_event.log("Finished Embedding Creation Task")
@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
         raise


-def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
+def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
     # Create potential chunks
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
+    processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})

     # Combine chunks for embedding
-    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
+    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
+    processor.log_tuning("Chunks: ", {'chunks': chunks})

     # Enrich chunks
     with current_event.create_span("Enrich Chunks"):
         enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
+        processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})

     # Create embeddings
     with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     return new_embeddings


-def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
+def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
     try:
         current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
-        markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
-
-        # Download the markdown file from MinIO
-        markdown_data = minio_client.download_document_file(tenant_id,
-                                                            document_version.bucket_name,
-                                                            markdown_on,
-                                                            )
-        markdown = markdown_data.decode('utf-8')
+        heading_level = processor.configuration.get('chunking_heading_level', 2)

         headers_to_split_on = [
-            ("#", "Header 1"),
-            ("##", "Header 2"),
+            (f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
         ]

+        processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
+
         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
         md_header_splits = markdown_splitter.split_text(markdown)
         potential_chunks = [doc.page_content for doc in md_header_splits]
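For reference, the comprehension introduced above yields one (marker, name) pair per heading level from 1 up to chunking_heading_level, capped at 6:

    heading_level = 4
    headers_to_split_on = [
        (f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
    ]
    print(headers_to_split_on)
    # [('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3'), ('####', 'Header 4')]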
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
         raise


-def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
+def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
     actual_chunks = []
     current_chunk = ""
     current_length = 0

+    def matches_chunking_pattern(text, patterns):
+        if not patterns:
+            return False
+
+        # Get the first line of the text
+        first_line = text.split('\n', 1)[0].strip()
+
+        # Check whether the first line is a markdown header
+        header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
+        if not header_match:
+            return False
+
+        # Heading level (number of #s) and header text
+        header_level = len(header_match.group(1))
+        header_text = header_match.group(2)
+
+        # Check whether the header text matches any configured pattern
+        for pattern in patterns:
+            try:
+                processor.log_tuning('Pattern check: ', {
+                    'pattern: ': pattern,
+                    'text': header_text
+                })
+                if re.search(pattern, header_text, re.IGNORECASE):
+                    return True
+            except Exception as e:
+                current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
+                continue
+
+        return False
+
+    chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
+
+    processor.log_tuning('Chunking Patterns Extraction: ', {
+        'Full Configuration': processor.configuration,
+        'Chunking Patterns': chunking_patterns,
+    })
+
     for chunk in potential_chunks:
         chunk_length = len(chunk)

+        # Force a new chunk if a configured pattern matches
+        if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
+            if current_chunk and current_length >= min_chars:
+                actual_chunks.append(current_chunk)
+            current_chunk = chunk
+            current_length = chunk_length
+            continue
+
         if current_length + chunk_length > max_chars:
             if current_length >= min_chars:
                 actual_chunks.append(current_chunk)
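The pattern rule added in the last hunk distils to a standalone snippet: a potential chunk forces a new actual chunk only when its first line is a markdown header whose text matches one of the configured regexes, case-insensitively. The sketch below mirrors matches_chunking_pattern without the tuning logs (the sample patterns are illustrative):

    import re

    chunking_patterns = ['^chapter [0-9]+', '^appendix']

    def first_line_matches(chunk, patterns):
        # The first line must be a markdown header ('#' through '######')...
        first_line = chunk.split('\n', 1)[0].strip()
        header = re.match(r'^(#{1,6})\s+(.+)$', first_line)
        if not header:
            return False
        # ...and its text must match one of the patterns, ignoring case.
        return any(re.search(p, header.group(2), re.IGNORECASE) for p in patterns)

    print(first_line_matches("## Chapter 3\nBody text...", chunking_patterns))        # True
    print(first_line_matches("Introductory paragraph, no header.", chunking_patterns)) # False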