- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of processor types DOCX and Markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions
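
For illustration, a processor configuration carrying the two new parameters might look like this (the values are made up; the key names and the default heading level of 2 come from PROCESSOR_TYPES, and chunking_patterns is stored as a JSON-encoded list of regexes, see patterns_to_json below):

# Hypothetical processor configuration using the new chunking parameters
configuration = {
    "chunking_heading_level": 3,                                     # split on headings up to ###
    "chunking_patterns": '["^Chapter [0-9]+", "^Appendix [A-Z]"]',   # JSON list of regexes
}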

View File

@@ -64,6 +64,20 @@ class TaggingFields(BaseModel):
}
class ChunkingPatternsField(BaseModel):
"""Represents a set of chunking patterns"""
patterns: List[str]
@field_validator('patterns')
def validate_patterns(cls, patterns):
for pattern in patterns:
try:
re.compile(pattern)
except re.error as e:
raise ValueError(f"Invalid regex pattern '{pattern}': {str(e)}")
return patterns
class ArgumentConstraint(BaseModel):
"""Base class for all argument constraints"""
description: Optional[str] = None
@@ -611,3 +625,38 @@ def _generate_yaml_docs(fields: Dict[str, Any], version: str) -> str:
}
return yaml.dump(doc, sort_keys=False, default_flow_style=False)
def patterns_to_json(text_area_content: str) -> str:
"""Convert line-based patterns to JSON"""
text_area_content = text_area_content.strip()
if len(text_area_content) == 0:
return json.dumps([])
# Split on newlines and remove empty lines
patterns = [line.strip() for line in text_area_content.split('\n') if line.strip()]
return json.dumps(patterns)
def json_to_patterns(json_content: str) -> str:
"""Convert JSON patterns list to text area content"""
try:
patterns = json.loads(json_content)
if not isinstance(patterns, list):
raise ValueError("JSON must contain a list of patterns")
# Join with newlines
return '\n'.join(patterns)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format: {e}")
def json_to_pattern_list(json_content: str) -> list:
"""Convert JSON patterns list to text area content"""
try:
patterns = json.loads(json_content)
if not isinstance(patterns, list):
raise ValueError("JSON must contain a list of patterns")
# Unescape if needed
patterns = [pattern.replace('\\\\', '\\') for pattern in patterns]
return patterns
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format: {e}")

View File

@@ -12,7 +12,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse
import os
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion)
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
from ..models.user import Tenant
@@ -219,12 +219,6 @@ def start_embedding_task(tenant_id, doc_vers_id):
return task.id
def validate_file_type(extension):
if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
def get_filename_from_url(url):
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
@@ -363,3 +357,109 @@ def cope_with_local_url(url):
return url
def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -> tuple[Document, DocumentVersion]:
"""
Look up a document using metadata criteria
Args:
tenant_id: ID of the tenant
lookup_criteria: Dictionary of key-value pairs to match in metadata
metadata_type: Which metadata to search in ('user_metadata' or 'system_metadata')
Returns:
Tuple of (Document, DocumentVersion) if found
Raises:
ValueError: If invalid metadata_type provided
EveAIException: If lookup fails
"""
if metadata_type not in ['user_metadata', 'system_metadata']:
raise ValueError(f"Invalid metadata_type: {metadata_type}")
try:
# Query for the latest document version matching the criteria
query = (db.session.query(Document, DocumentVersion)
.join(DocumentVersion)
.filter(Document.id == DocumentVersion.doc_id)
.order_by(DocumentVersion.id.desc()))
# Add metadata filtering using PostgreSQL JSONB operators
metadata_field = getattr(DocumentVersion, metadata_type)
for key, value in lookup_criteria.items():
query = query.filter(metadata_field[key].astext == str(value))
# Get first result
result = query.first()
if not result:
raise EveAIException(
f"No document found matching criteria in {metadata_type}",
status_code=404
)
return result
except SQLAlchemyError as e:
current_app.logger.error(f'Database error during document lookup for tenant {tenant_id}: {e}')
raise EveAIException(
"Database error during document lookup",
status_code=500
)
except Exception as e:
current_app.logger.error(f'Error during document lookup for tenant {tenant_id}: {e}')
raise EveAIException(
"Error during document lookup",
status_code=500
)
# Add to common/utils/document_utils.py
def refresh_document_with_content(doc_id: int, tenant_id: int, file_content: bytes, api_input: dict) -> tuple:
"""
Refresh document with new content
Args:
doc_id: Document ID
tenant_id: Tenant ID
file_content: New file content
api_input: Additional document information
Returns:
Tuple of (new_version, task_id)
"""
doc = Document.query.get(doc_id)
if not doc:
raise EveAIInvalidDocument(tenant_id, doc_id)
old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()
# Create new version with same file type as original
extension = old_doc_vers.file_type
new_doc_vers = create_version_for_document(
doc, tenant_id,
'', # No URL for content-based updates
old_doc_vers.sub_file_type,
api_input.get('language', old_doc_vers.language),
api_input.get('user_context', old_doc_vers.user_context),
api_input.get('user_metadata', old_doc_vers.user_metadata),
api_input.get('catalog_properties', old_doc_vers.catalog_properties),
)
try:
db.session.add(new_doc_vers)
db.session.commit()
except SQLAlchemyError as e:
db.session.rollback()
return None, str(e)
# Upload new content
upload_file_for_version(new_doc_vers, file_content, extension, tenant_id)
# Start embedding task
task = current_celery.send_task('create_embeddings', args=[tenant_id, new_doc_vers.id], queue='embeddings')
current_app.logger.info(f'Embedding creation started for document {doc_id} on version {new_doc_vers.id} '
f'with task id: {task.id}.')
return new_doc_vers, task.id
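An illustrative call to the new lookup helper (the criteria keys are examples only; it returns the document together with its latest matching version, or raises EveAIException with a 404 if nothing matches):

doc, doc_vers = lookup_document(
    tenant_id=42,
    lookup_criteria={"external_id": "123", "source": "zapier"},
    metadata_type="user_metadata",
)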

View File

@@ -55,7 +55,6 @@ class Config(object):
# file upload settings
MAX_CONTENT_LENGTH = 50 * 1024 * 1024
UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']
# supported languages
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
@@ -143,10 +142,7 @@ class Config(object):
LANGCHAIN_ENDPOINT = 'https://api.smith.langchain.com'
LANGCHAIN_PROJECT = "eveai"
SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test', 'Wordpress Starter']
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']
# The maximum number of seconds allowed for audio compression (to save resources)
MAX_COMPRESSION_DURATION = 60*10 # 10 minutes

View File

@@ -5,6 +5,19 @@ PROCESSOR_TYPES = {
"file_types": "html",
"Description": "A processor for HTML files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"html_tags": {
"name": "HTML Tags",
"type": "string",
@@ -45,7 +58,21 @@ PROCESSOR_TYPES = {
"name": "PDF Processor",
"file_types": "pdf",
"Description": "A Processor for PDF files",
"configuration": {}
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
},
},
"AUDIO_PROCESSOR": {
"name": "AUDIO Processor",
@@ -53,4 +80,89 @@ PROCESSOR_TYPES = {
"Description": "A Processor for audio files",
"configuration": {}
},
"MARKDOWN_PROCESSOR": {
"name": "Markdown Processor",
"file_types": "md",
"Description": "A Processor for markdown files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
}
},
"DOCX_PROCESSOR": {
"name": "DOCX Processor",
"file_types": "docx",
"Description": "A processor for DOCX files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"extract_comments": {
"name": "Extract Comments",
"type": "boolean",
"description": "Whether to include document comments in the markdown",
"required": False,
"default": False
},
"extract_headers_footers": {
"name": "Extract Headers/Footers",
"type": "boolean",
"description": "Whether to include headers and footers in the markdown",
"required": False,
"default": False
},
"preserve_formatting": {
"name": "Preserve Formatting",
"type": "boolean",
"description": "Whether to preserve bold, italic, and other text formatting",
"required": False,
"default": True
},
"list_style": {
"name": "List Style",
"type": "enum",
"description": "How to format lists in markdown",
"required": False,
"default": "dash",
"allowed_values": ["dash", "asterisk", "plus"]
},
"image_handling": {
"name": "Image Handling",
"type": "enum",
"description": "How to handle embedded images",
"required": False,
"default": "skip",
"allowed_values": ["skip", "extract", "placeholder"]
},
"table_alignment": {
"name": "Table Alignment",
"type": "enum",
"description": "How to align table contents",
"required": False,
"default": "left",
"allowed_values": ["left", "center", "preserve"]
}
}
}
}
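Put together, a configured DOCX processor might store something like the following (hypothetical values; keys and defaults mirror the DOCX_PROCESSOR definition above):

docx_configuration = {
    "extract_comments": False,
    "extract_headers_footers": False,
    "preserve_formatting": True,
    "list_style": "dash",
    "image_handling": "skip",
    "table_alignment": "left",
}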

View File

@@ -11,9 +11,10 @@ from common.utils.document_utils import (
create_document_stack, process_url, start_embedding_task,
validate_file_type, EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
get_documents_list, edit_document, refresh_document, edit_document_version,
refresh_document_with_info
refresh_document_with_info, lookup_document
)
from common.utils.eveai_exceptions import EveAIException
from eveai_api.api.auth import requires_service
def validate_date(date_str):
@@ -59,6 +60,7 @@ add_document_response = document_ns.model('AddDocumentResponse', {
@document_ns.route('/add_document')
class AddDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(upload_parser)
@document_ns.response(201, 'Document added successfully', add_document_response)
@document_ns.response(400, 'Validation Error')
@@ -134,6 +136,7 @@ add_url_response = document_ns.model('AddURLResponse', {
@document_ns.route('/add_url')
class AddURL(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(add_url_model)
@document_ns.response(201, 'Document added successfully', add_url_response)
@document_ns.response(400, 'Validation Error')
@@ -190,6 +193,7 @@ document_list_model = document_ns.model('DocumentList', {
@document_ns.route('/list')
class DocumentList(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('list_documents')
@document_ns.marshal_list_with(document_list_model, envelope='documents')
def get(self):
@@ -210,6 +214,7 @@ edit_document_model = document_ns.model('EditDocument', {
@document_ns.route('/<int:document_id>')
class DocumentResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document')
@document_ns.expect(edit_document_model)
@document_ns.response(200, 'Document updated successfully')
@@ -232,6 +237,7 @@ class DocumentResource(Resource):
return e.to_dict(), e.status_code
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('refresh_document')
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
@@ -253,6 +259,7 @@ edit_document_version_model = document_ns.model('EditDocumentVersion', {
@document_ns.route('/version/<int:version_id>')
class DocumentVersionResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document_version')
@document_ns.expect(edit_document_version_model)
@document_ns.response(200, 'Document version updated successfully')
@@ -280,6 +287,7 @@ refresh_document_model = document_ns.model('RefreshDocument', {
@document_ns.route('/<int:document_id>/refresh')
class RefreshDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(404, 'Document not found')
def post(self, document_id):
@@ -310,6 +318,7 @@ class RefreshDocument(Resource):
@document_ns.route('/<int:document_id>/refresh_with_info')
class RefreshDocumentWithInfo(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_document_model)
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(400, 'Validation Error')
@@ -338,3 +347,112 @@ class RefreshDocumentWithInfo(Resource):
except Exception as e:
current_app.logger.error(f'Error refreshing document with info: {str(e)}')
return {'message': 'Internal server error'}, 500
# Define models for lookup requests
lookup_model = document_ns.model('DocumentLookup', {
'lookup_criteria': fields.Raw(required=True,
description='JSON object containing key-value pairs to match in metadata. '
'Example: {"external_id": "123", "source": "zapier", "source_type": "google_docs"}'),
'metadata_type': fields.String(required=True, enum=['user_metadata', 'system_metadata'],
description='Which metadata field to search in')
})
lookup_response = document_ns.model('DocumentLookupResponse', {
'document_id': fields.Integer(description='ID of the found document'),
'document_version_id': fields.Integer(description='ID of the latest document version'),
'name': fields.String(description='Document name'),
'metadata': fields.Raw(description='Full metadata of the found document')
})
@document_ns.route('/lookup')
class DocumentLookup(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(lookup_model)
@document_ns.marshal_with(lookup_response)
@document_ns.response(200, 'Document found', lookup_response)
@document_ns.response(404, 'No document found matching criteria')
def post(self):
"""
Look up a document using metadata criteria
"""
tenant_id = get_jwt_identity()
try:
data = request.json
document, version = lookup_document(
tenant_id,
data['lookup_criteria'],
data['metadata_type']
)
return {
'document_id': document.id,
'document_version_id': version.id,
'name': document.name,
'metadata': getattr(version, data['metadata_type'])
}
except EveAIException as e:
return e.to_dict(), e.status_code
except KeyError as e:
return {'message': f'Missing required field: {str(e)}'}, 400
refresh_content_model = document_ns.model('RefreshDocumentContent', {
'file_content': fields.Raw(required=True, description='The new file content'),
'language': fields.String(required=False, description='Language of the document'),
'user_context': fields.String(required=False, description='User context for the document'),
'user_metadata': fields.Raw(required=False, description='Custom metadata fields'),
'catalog_properties': fields.Raw(required=False, description='Catalog-specific properties'),
'trigger_service': fields.String(required=False, description='Service that triggered the update')
})
@document_ns.route('/<int:document_id>/refresh_content')
class RefreshDocumentContent(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_content_model)
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
"""Refresh a document with new content"""
tenant_id = get_jwt_identity()
try:
data = request.json
file_content = data['file_content']
# Build user_metadata by merging:
# 1. Existing metadata (if any)
# 2. New metadata from request
# 3. Zapier-specific fields
user_metadata = data.get('user_metadata', {})
user_metadata.update({
'source': 'zapier',
'trigger_service': data.get('trigger_service')
})
data['user_metadata'] = user_metadata
# Keep catalog_properties separate
if 'catalog_properties' in data:
# We could add validation here against catalog configuration
data['catalog_properties'] = data['catalog_properties']
new_version, task_id = refresh_document_with_content(
document_id,
tenant_id,
file_content,
data
)
return {
'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {task_id}',
'document_id': document_id,
'document_version_id': new_version.id,
'task_id': task_id
}, 200
except EveAIException as e:
return e.to_dict(), e.status_code
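A sketch of how a client might call the new lookup endpoint (base URL, route prefix and token are placeholders):

import requests

resp = requests.post(
    "https://api.example.com/document/lookup",   # placeholder host and prefix
    headers={"Authorization": "Bearer <JWT>"},
    json={
        "lookup_criteria": {"external_id": "123", "source": "zapier"},
        "metadata_type": "user_metadata",
    },
)
print(resp.status_code, resp.json())  # 200 with document_id / document_version_id on success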

View File

@@ -15,14 +15,6 @@ from config.type_defs.retriever_types import RETRIEVER_TYPES
from .dynamic_form_base import DynamicFormBase
def allowed_file(form, field):
if field.data:
filename = field.data.filename
allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
raise ValidationError('Unsupported file type.')
def validate_json(form, field):
if field.data:
try:
@@ -101,7 +93,10 @@ class ProcessorForm(FlaskForm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Dynamically populate the 'type' field using the constructor
self.type.choices = [(key, value['name']) for key, value in PROCESSOR_TYPES.items()]
self.type.choices = sorted(
[(key, value['name']) for key, value in PROCESSOR_TYPES.items()],
key=lambda x: x[1],
)
class EditProcessorForm(DynamicFormBase):
@@ -177,7 +172,7 @@ class EditRetrieverForm(DynamicFormBase):
class AddDocumentForm(DynamicFormBase):
file = FileField('File', validators=[FileRequired(), allowed_file])
file = FileField('File', validators=[FileRequired()])
catalog = StringField('Catalog', render_kw={'readonly': True})
sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
name = StringField('Name', validators=[Length(max=100)])

View File

@@ -14,7 +14,7 @@ import json
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
from common.extensions import db
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
@@ -391,9 +391,6 @@ def add_document():
sub_file_type = form.sub_file_type.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
validate_file_type(extension)
catalog_properties = {}
document_version_configurations = CATALOG_TYPES[catalog.type]['document_version_configurations']
for config in document_version_configurations:

View File

@@ -5,7 +5,46 @@ import json
from wtforms.fields.choices import SelectField
from wtforms.fields.datetime import DateField
from common.utils.config_field_types import TaggingFields
from common.utils.config_field_types import TaggingFields, json_to_patterns, patterns_to_json
class TaggingFieldsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return json.dumps(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# try:
# self.data = json.loads(valuelist[0])
# except json.JSONDecodeError as e:
# raise ValueError('Not valid JSON content')
class ChunkingPatternsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return '\n'.join(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# self.data = [line.strip() for line in valuelist[0].split('\n') if line.strip()]
class DynamicFormBase(FlaskForm):
@@ -80,7 +119,7 @@ class DynamicFormBase(FlaskForm):
# Handle special case for tagging_fields
if field_type == 'tagging_fields':
field_class = TextAreaField
field_class = TaggingFieldsField
extra_classes = 'json-editor'
field_kwargs = {}
elif field_type == 'enum':
@@ -89,6 +128,10 @@ class DynamicFormBase(FlaskForm):
choices = [(str(val), str(val)) for val in allowed_values]
extra_classes = ''
field_kwargs = {'choices': choices}
elif field_type == 'chunking_patterns':
field_class = ChunkingPatternsField
extra_classes = ['monospace-text', 'pattern-input']
field_kwargs = {}
else:
extra_classes = ''
field_class = {
@@ -111,6 +154,12 @@ class DynamicFormBase(FlaskForm):
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to JSON: {e}")
field_data = "{}"
elif field_type == 'chunking_patterns':
try:
field_data = json_to_patterns(field_data)
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to a list of patterns: {e}")
field_data = {}
elif default is not None:
field_data = default
@@ -173,12 +222,17 @@ class DynamicFormBase(FlaskForm):
original_field_name = full_field_name[prefix_length:]
field = getattr(self, full_field_name)
# Parse JSON for tagging_fields type
if isinstance(field, TextAreaField) and field.data:
if isinstance(field, TaggingFieldsField) and field.data:
try:
data[original_field_name] = json.loads(field.data)
except json.JSONDecodeError:
# Validation should catch this, but just in case
data[original_field_name] = field.data
elif isinstance(field, ChunkingPatternsField):
try:
data[original_field_name] = patterns_to_json(field.data)
except Exception as e:
current_app.logger.error(f"Error converting initial data to patterns: {e}")
else:
data[original_field_name] = field.data
return data
@@ -230,5 +284,3 @@ def validate_tagging_fields(form, field):
except (TypeError, ValueError) as e:
raise ValidationError(f"Invalid field definition: {str(e)}")

View File

@@ -46,7 +46,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
try:
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
total_duration = len(audio_info)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000 # Convert to milliseconds
@@ -55,7 +55,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
@@ -87,7 +87,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
@@ -172,14 +172,14 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
self.log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
self.log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -202,7 +202,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
self.log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription

View File

@@ -17,7 +17,7 @@ class BaseProcessor(ABC):
self.tuning_logger = None
self._setup_tuning_logger()
self._log_tuning("Processor initialized", {
self.log_tuning("Processor initialized", {
"processor_type": processor.type if processor else None,
"document_version": document_version.id if document_version else None,
"catalog": catalog.id if catalog else None
@@ -42,6 +42,10 @@ class BaseProcessor(ABC):
def process(self):
pass
@property
def configuration(self):
return self.processor.configuration
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
@@ -78,7 +82,7 @@ class BaseProcessor(ABC):
return markdown
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
def log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
if self.tuning and self.tuning_logger:
try:
self.tuning_logger.log_tuning('processor', message, data)

View File

@@ -0,0 +1,129 @@
import docx
import io
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry
from common.extensions import minio_client
import re
class DocxProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.config = processor.configuration
self.extract_comments = self.config.get('extract_comments', False)
self.extract_headers_footers = self.config.get('extract_headers_footers', False)
self.preserve_formatting = self.config.get('preserve_formatting', True)
self.list_style = self.config.get('list_style', 'dash')
self.image_handling = self.config.get('image_handling', 'skip')
self.table_alignment = self.config.get('table_alignment', 'left')
def process(self):
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
doc = docx.Document(io.BytesIO(file_data))
markdown = self._convert_to_markdown(doc)
title = self._extract_title(doc)
self._save_markdown(markdown)
return markdown, title
except Exception as e:
self._log(f"Error processing DOCX: {str(e)}", level='error')
raise
def _convert_to_markdown(self, doc):
markdown_parts = []
if self.extract_headers_footers:
for section in doc.sections:
if section.header.paragraphs:
markdown_parts.extend(self._process_paragraphs(section.header.paragraphs))
markdown_parts.extend(self._process_paragraphs(doc.paragraphs))
if self.extract_comments and doc.comments:
markdown_parts.append("\n## Comments\n")
for comment in doc.comments:
markdown_parts.append(f"> {comment.text}\n")
return "\n".join(markdown_parts)
def _process_paragraphs(self, paragraphs):
markdown_parts = []
in_list = False
for para in paragraphs:
if not para.text.strip():
continue
style = para.style.name.lower()
if 'heading' in style:
level = int(style[-1]) if style[-1].isdigit() else 1
markdown_parts.append(f"{'#' * level} {para.text}\n")
elif para._p.pPr and para._p.pPr.numPr: # List item
marker = self._get_list_marker()
markdown_parts.append(f"{marker} {para.text}\n")
in_list = True
else:
if in_list:
markdown_parts.append("\n")
in_list = False
text = para.text
if self.preserve_formatting:
text = self._apply_formatting(para)
markdown_parts.append(f"{text}\n")
return markdown_parts
def _get_list_marker(self):
return {
'dash': '-',
'asterisk': '*',
'plus': '+'
}.get(self.list_style, '-')
def _apply_formatting(self, paragraph):
text = paragraph.text
if not text:
return ""
runs = paragraph.runs
formatted_parts = []
for run in runs:
part = run.text
if run.bold:
part = f"**{part}**"
if run.italic:
part = f"*{part}*"
if run.underline:
part = f"__{part}__"
formatted_parts.append(part)
return "".join(formatted_parts)
def _extract_title(self, doc):
if doc.paragraphs:
first_para = doc.paragraphs[0]
if 'heading' in first_para.style.name.lower():
return first_para.text.strip()
# Look for first Heading 1 in document
for para in doc.paragraphs:
if para.style.name.lower() == 'heading 1':
return para.text.strip()
return "Untitled Document"
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)

View File

@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
self.log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')

View File

@@ -0,0 +1,48 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
def _find_first_h1(markdown: str) -> str:
# Look for # Header (allowing spaces after #)
match = re.search(r'^#\s+(.+)$', markdown, re.MULTILINE)
return match.group(1).strip() if match else ""
class MarkdownProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
def process(self):
self._log("Starting Markdown processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
markdown = file_data.decode('utf-8')
title = _find_first_h1(markdown)
self._save_markdown(markdown)
self._log("Finished processing Markdown")
return markdown, title
except Exception as e:
self._log(f"Error processing Markdown: {str(e)}", level='error')
raise
ProcessorRegistry.register("MARKDOWN_PROCESSOR", MarkdownProcessor)

View File

@@ -57,7 +57,7 @@ class PDFProcessor(BaseProcessor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
@@ -119,7 +119,7 @@ class PDFProcessor(BaseProcessor):
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables

View File

@@ -45,7 +45,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
self.log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
@@ -64,7 +64,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
self.log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,

View File

@@ -1,3 +1,4 @@
import re
from datetime import datetime as dt, timezone as tz
from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
from common.utils.config_field_types import json_to_pattern_list
# Healthcheck task
@current_celery.task(name='ping', queue='embeddings')
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
processor=processor
)
markdown, title = document_processor.process()
document_processor.log_tuning("Processor returned: ", {
'markdown': markdown,
'title': title
})
with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)
current_event.log("Finished Embedding Creation Task")
@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
raise
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
processor.log_tuning("Chunks: ", {'chunks': chunks})
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})
# Create embeddings
with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
return new_embeddings
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
# Download the markdown file from MinIO
markdown_data = minio_client.download_document_file(tenant_id,
document_version.bucket_name,
markdown_on,
)
markdown = markdown_data.decode('utf-8')
heading_level = processor.configuration.get('chunking_heading_level', 2)
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
(f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
]
processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
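For the default chunking_heading_level of 2, the comprehension above expands to:

headers_to_split_on == [("#", "Header 1"), ("##", "Header 2")]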
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
raise
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
actual_chunks = []
current_chunk = ""
current_length = 0
def matches_chunking_pattern(text, patterns):
if not patterns:
return False
# Get the first line of the text
first_line = text.split('\n', 1)[0].strip()
# Check if it's a header at appropriate level
header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
if not header_match:
return False
# Get the heading level (number of #s)
header_level = len(header_match.group(1))
# Get the header text
header_text = header_match.group(2)
# Check if header matches any pattern
for pattern in patterns:
try:
processor.log_tuning('Pattern check: ', {
'pattern: ': pattern,
'text': header_text
})
if re.search(pattern, header_text, re.IGNORECASE):
return True
except Exception as e:
current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
continue
return False
chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
processor.log_tuning(f'Chunking Patterns Extraction: ', {
'Full Configuration': processor.configuration,
'Chunking Patterns': chunking_patterns,
})
for chunk in potential_chunks:
chunk_length = len(chunk)
# Force new chunk if pattern matches
if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
if current_chunk and current_length >= min_chars:
actual_chunks.append(current_chunk)
current_chunk = chunk
current_length = chunk_length
continue
if current_length + chunk_length > max_chars:
if current_length >= min_chars:
actual_chunks.append(current_chunk)
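A standalone sketch of the pattern check used here (sample chunk and pattern only):

import re

chunk = "## Chapter 3: Results\nBody text of the section..."
pattern = r"^Chapter [0-9]+"

first_line = chunk.split('\n', 1)[0].strip()
header = re.match(r'^(#{1,6})\s+(.+)$', first_line)
# "Chapter 3: Results" matches the pattern, so a new chunk is forced at this heading
forces_new_chunk = bool(header and re.search(pattern, header.group(2), re.IGNORECASE))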

View File

@@ -89,3 +89,4 @@ prometheus_flask_exporter~=0.23.1
prometheus_client~=0.20.0
babel~=2.16.0
dogpile.cache~=1.3.3
python-docx~=1.1.2