- Significantly changed the PDF Processor to use Mistral's OCR model

- ensure very long chunks are split into smaller chunks
- ensure TrackedMistralAIEmbedding batches requests when needed so embedding runs correctly
- upgraded several packages to newer versions
This commit is contained in:
Josako
2025-04-16 15:39:16 +02:00
parent 5f58417d24
commit 4bf12db142
10 changed files with 518 additions and 91 deletions

View File

@@ -7,6 +7,7 @@ from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.eveai_model.tracked_mistral_ocr_client import TrackedMistralOcrClient
from common.extensions import minio_client
from common.utils.model_utils import create_language_template, get_embedding_llm
from .base_processor import BaseProcessor
@@ -21,6 +22,7 @@ class PDFProcessor(BaseProcessor):
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
self.ocr_client = TrackedMistralOcrClient()
def process(self):
self._log("Starting PDF processing")
@@ -30,14 +32,10 @@ class PDFProcessor(BaseProcessor):
self.document_version.bucket_name,
self.document_version.object_name,
)
with current_event.create_span("PDF Extraction"):
extracted_content = self._extract_content(file_data)
structured_content, title = self._structure_content(extracted_content)
file_name = f"{self.document_version.bucket_name}_{self.document_version.object_name.replace("/", "_")}"
with current_event.create_span("Markdown Generation"):
llm_chunks = self._split_content_for_llm(structured_content)
markdown = self._process_chunks_with_llm(llm_chunks)
markdown, title = self.ocr_client.process_pdf(file_name, file_data)
self._save_markdown(markdown)
self._log("Finished processing PDF")

View File

@@ -144,7 +144,8 @@ def delete_embeddings_for_document_version(document_version):
def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown,
catalog.max_chunk_size)
processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})
# Combine chunks for embedding
@@ -254,27 +255,286 @@ def embed_chunks(tenant, catalog, document_version, chunks):
return new_embeddings
def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown, max_chunk_size=2500):
    """
    Split a markdown document into potential chunks for embedding.

    The markdown is first split on the configured heading levels; any chunk
    that still exceeds ``max_chunk_size`` is split further on deeper heading
    levels and, if that is not enough, on paragraph boundaries.

    Args:
        tenant_id: Tenant the document belongs to (used for logging).
        document_version: Document version being processed.
            NOTE(review): not referenced in this body — kept for interface parity; confirm.
        processor: Processor exposing ``configuration`` and ``log_tuning``.
        markdown: Markdown text to split.
        max_chunk_size: Maximum allowed chunk size in characters.

    Returns:
        List of markdown chunk strings, each at most ``max_chunk_size`` characters
        except where a single unsplittable unit exceeds it.

    Raises:
        Re-raises any exception after logging it.
    """
    try:
        current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
        configured_heading_level = processor.configuration.get('chunking_heading_level', 2)
        # Split on heading levels 1..configured level (markdown allows at most 6).
        headers_to_split_on = [
            (f"{'#' * i}", f"Header {i}") for i in range(1, min(configured_heading_level + 1, 7))
        ]
        processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
        md_header_splits = markdown_splitter.split_text(markdown)
        initial_chunks = [doc.page_content for doc in md_header_splits]

        final_chunks = []
        for chunk in initial_chunks:
            if len(chunk) <= max_chunk_size:
                final_chunks.append(chunk)
            else:
                # This chunk is too large, split it further.
                processor.log_tuning('Further splitting required', {
                    'chunk_size': len(chunk),
                    'max_chunk_size': max_chunk_size
                })
                # Try splitting on deeper heading levels first.
                deeper_chunks = split_on_deeper_headings(chunk, configured_heading_level, max_chunk_size)
                # If deeper heading splits still exceed max size, split on paragraphs.
                chunks_to_process = []
                for deeper_chunk in deeper_chunks:
                    if len(deeper_chunk) <= max_chunk_size:
                        chunks_to_process.append(deeper_chunk)
                    else:
                        chunks_to_process.extend(split_on_paragraphs(deeper_chunk, max_chunk_size))
                final_chunks.extend(chunks_to_process)

        processor.log_tuning('Final chunks', {
            'initial_chunk_count': len(initial_chunks),
            'final_chunk_count': len(final_chunks)
        })
        return final_chunks
    except Exception as e:
        current_app.logger.error(f'Error creating potential chunks for tenant {tenant_id}, with error: {e}')
        raise
def split_on_deeper_headings(chunk, already_split_level, max_chunk_size):
    """
    Split a chunk on heading levels deeper than the level already used.

    Args:
        chunk: Markdown chunk to split.
        already_split_level: Heading level already used for splitting.
        max_chunk_size: Maximum allowed chunk size.
            NOTE(review): not consulted here — kept for interface parity; confirm.

    Returns:
        List of sub-chunks split on deeper headings, or ``[chunk]`` when no
        deeper levels exist or splitting fails.
    """
    # Markdown supports heading levels up to 6, so only levels strictly
    # deeper than the one already applied are candidates.
    header_specs = [
        ("#" * level, f"Header {level}")
        for level in range(already_split_level + 1, 7)
    ]
    if not header_specs:
        # Already at (or past) the deepest level — nothing more to split on.
        return [chunk]
    try:
        sub_documents = MarkdownHeaderTextSplitter(header_specs, strip_headers=False).split_text(chunk)
        return [sub_doc.page_content for sub_doc in sub_documents]
    except Exception:
        # Best-effort: fall back to the unsplit chunk on any splitter failure.
        return [chunk]
def split_on_paragraphs(chunk, max_chunk_size):
    """
    Split a markdown chunk on paragraph boundaries, keeping tables intact.

    The chunk is first partitioned into text parts (paragraph-bounded) and
    table parts (runs of ``|``-prefixed lines), then the parts are packed
    into chunks of at most ``max_chunk_size`` characters. Tables are kept
    whole when possible; oversized text is split on paragraphs, then on
    sentences, then hard-split as a last resort.

    Args:
        chunk: Markdown chunk to split.
        max_chunk_size: Maximum allowed chunk size in characters.

    Returns:
        List of chunks split on paragraph boundaries.
    """
    # --- Phase 1: partition into ('text', ...) and ('table', ...) parts ---
    parts = []
    current_part = ""
    in_table = False
    table_content = ""
    lines = chunk.split('\n')
    for i, line in enumerate(lines):
        # Check if this line starts a table.
        if line.strip().startswith('|') and not in_table:
            # Flush any pending text before the table starts.
            if current_part.strip():
                parts.append(('text', current_part))
                current_part = ""
            in_table = True
            table_content = line + '\n'
        elif in_table:
            table_content += line + '\n'
            # An empty line directly after a table row ends the table.
            if not line.strip() and i > 0 and lines[i - 1].strip().startswith('|'):
                parts.append(('table', table_content))
                table_content = ""
                in_table = False
        else:
            current_part += line + '\n'
            # A blank line marks a paragraph boundary.
            if not line.strip() and current_part.strip():
                parts.append(('text', current_part))
                current_part = ""
    # Handle any remaining content after the last line.
    if in_table and table_content.strip():
        parts.append(('table', table_content))
    elif current_part.strip():
        parts.append(('text', current_part))

    # --- Phase 2: pack parts into chunks that respect max_chunk_size ---
    result_chunks = []
    current_chunk = ""
    for part_type, content in parts:
        if part_type == 'table':
            # Tables are kept whole unless they alone exceed the limit.
            if len(current_chunk) + len(content) > max_chunk_size:
                if current_chunk:
                    result_chunks.append(current_chunk)
                    # BUGFIX: reset the accumulator so the emitted text is not
                    # duplicated into later chunks when the table gets split.
                    current_chunk = ""
                if len(content) > max_chunk_size:
                    # The table itself exceeds the limit — split it by rows.
                    result_chunks.extend(split_table(content, max_chunk_size))
                else:
                    current_chunk = content
            else:
                current_chunk += content
        else:
            # Text parts can be split more freely.
            if len(content) <= max_chunk_size:
                if len(current_chunk) + len(content) <= max_chunk_size:
                    current_chunk += content
                else:
                    result_chunks.append(current_chunk)
                    current_chunk = content
            else:
                # Text part is too large: flush, then split by paragraphs.
                if current_chunk:
                    result_chunks.append(current_chunk)
                    current_chunk = ""
                paragraphs = content.split('\n\n')
                for paragraph in paragraphs:
                    paragraph_with_newlines = paragraph + '\n\n'
                    if len(paragraph_with_newlines) > max_chunk_size:
                        # Single oversized paragraph: split by sentences.
                        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
                        current_sentence_chunk = ""
                        for sentence in sentences:
                            sentence_with_space = sentence + ' '
                            if len(current_sentence_chunk) + len(sentence_with_space) <= max_chunk_size:
                                current_sentence_chunk += sentence_with_space
                            else:
                                if current_sentence_chunk:
                                    result_chunks.append(current_sentence_chunk.strip())
                                    # BUGFIX: reset so the already-emitted sentence
                                    # group is not appended again after the loop.
                                    current_sentence_chunk = ""
                                if len(sentence_with_space) > max_chunk_size:
                                    # Single oversized sentence: hard-split.
                                    for start in range(0, len(sentence_with_space), max_chunk_size):
                                        result_chunks.append(sentence_with_space[start:start + max_chunk_size].strip())
                                else:
                                    current_sentence_chunk = sentence_with_space
                        if current_sentence_chunk:
                            result_chunks.append(current_sentence_chunk.strip())
                    elif len(current_chunk) + len(paragraph_with_newlines) <= max_chunk_size:
                        current_chunk += paragraph_with_newlines
                    else:
                        if current_chunk:
                            result_chunks.append(current_chunk.strip())
                        current_chunk = paragraph_with_newlines
    # Add the last chunk if there's anything left.
    if current_chunk:
        result_chunks.append(current_chunk.strip())
    return result_chunks
def split_table(table_content, max_chunk_size):
    """
    Split a markdown table into multiple chunks, trying to keep rows together.

    The (up to two) header rows are repeated at the top of every chunk so
    each piece remains a readable table.

    Args:
        table_content: Markdown table content.
        max_chunk_size: Maximum allowed chunk size.

    Returns:
        List of table chunks.
    """
    rows = table_content.split('\n')
    # Header = the first (up to) two rows that look like table rows
    # (usually the column names and the separator row).
    header_rows = [row for row in rows[:2] if row.strip().startswith('|')]
    header = '\n'.join(header_rows) + '\n' if header_rows else ''

    if len(header) > max_chunk_size:
        # Degenerate case: even the header is too big — pack raw lines
        # into chunks without repeating any header.
        pieces = []
        buffer = ""
        for row in rows:
            if len(buffer) + len(row) + 1 <= max_chunk_size:
                buffer += row + '\n'
            else:
                pieces.append(buffer)
                buffer = row + '\n'
        if buffer:
            pieces.append(buffer)
        return pieces

    # Normal case: start each chunk with the header, then add rows until full.
    pieces = []
    buffer = header
    for index, row in enumerate(rows):
        # Skip the header rows themselves (first len(header_rows) lines).
        if index < len(header_rows):
            continue
        if len(buffer) + len(row) + 1 <= max_chunk_size:
            buffer += row + '\n'
        else:
            # Row doesn't fit — emit the chunk and open a fresh one.
            pieces.append(buffer)
            buffer = header + row + '\n'
    if buffer != header:
        pieces.append(buffer)
    return pieces
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
actual_chunks = []
current_chunk = ""
@@ -325,6 +585,7 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processo
# Force new chunk if pattern matches
if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
if current_chunk and current_length >= min_chars:
current_app.logger.debug(f"Chunk Length of chunk to embed: {len(current_chunk)} ")
actual_chunks.append(current_chunk)
current_chunk = chunk
current_length = chunk_length
@@ -332,6 +593,7 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processo
if current_length + chunk_length > max_chars:
if current_length >= min_chars:
current_app.logger.debug(f"Chunk Length of chunk to embed: {len(current_chunk)} ")
actual_chunks.append(current_chunk)
current_chunk = chunk
current_length = chunk_length
@@ -345,6 +607,7 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processo
# Handle the last chunk
if current_chunk and current_length >= 0:
current_app.logger.debug(f"Chunk Length of chunk to embed: {len(current_chunk)} ")
actual_chunks.append(current_chunk)
return actual_chunks