- Addition of the general chunking parameters chunking_heading_level and chunking_patterns

- Addition of the Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -64,6 +64,20 @@ class TaggingFields(BaseModel):
}
class ChunkingPatternsField(BaseModel):
"""Represents a set of chunking patterns"""
patterns: List[str]
@field_validator('patterns')
def validate_patterns(cls, patterns):
for pattern in patterns:
try:
re.compile(pattern)
except re.error as e:
raise ValueError(f"Invalid regex pattern '{pattern}': {str(e)}")
return patterns
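
A quick usage sketch of the new model (assuming it lives in common/utils/config_field_types.py and pydantic v2 validator semantics, as above):

from common.utils.config_field_types import ChunkingPatternsField

ChunkingPatternsField(patterns=[r"^Chapter \d+", r"^Appendix [A-Z]$"])  # all patterns compile, accepted
ChunkingPatternsField(patterns=["(["])  # fails validation: "([" is not a valid regex
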
class ArgumentConstraint(BaseModel):
"""Base class for all argument constraints"""
description: Optional[str] = None
@@ -611,3 +625,38 @@ def _generate_yaml_docs(fields: Dict[str, Any], version: str) -> str:
}
return yaml.dump(doc, sort_keys=False, default_flow_style=False)
def patterns_to_json(text_area_content: str) -> str:
"""Convert line-based patterns to JSON"""
text_area_content = text_area_content.strip()
if len(text_area_content) == 0:
return json.dumps([])
# Split on newlines and remove empty lines
patterns = [line.strip() for line in text_area_content.split('\n') if line.strip()]
return json.dumps(patterns)
def json_to_patterns(json_content: str) -> str:
"""Convert JSON patterns list to text area content"""
try:
patterns = json.loads(json_content)
if not isinstance(patterns, list):
raise ValueError("JSON must contain a list of patterns")
# Join with newlines
return '\n'.join(patterns)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format: {e}")
def json_to_pattern_list(json_content: str) -> list:
"""Convert JSON patterns list to text area content"""
try:
patterns = json.loads(json_content)
if not isinstance(patterns, list):
raise ValueError("JSON must contain a list of patterns")
# Unescape if needed
patterns = [pattern.replace('\\\\', '\\') for pattern in patterns]
return patterns
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON format: {e}")

View File

@@ -12,7 +12,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse
import os
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion)
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
from ..models.user import Tenant
@@ -219,12 +219,6 @@ def start_embedding_task(tenant_id, doc_vers_id):
return task.id
def validate_file_type(extension):
if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
def get_filename_from_url(url):
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
@@ -363,3 +357,109 @@ def cope_with_local_url(url):
return url
def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -> tuple[Document, DocumentVersion]:
"""
Look up a document using metadata criteria
Args:
tenant_id: ID of the tenant
lookup_criteria: Dictionary of key-value pairs to match in metadata
metadata_type: Which metadata to search in ('user_metadata' or 'system_metadata')
Returns:
Tuple of (Document, DocumentVersion) if found
Raises:
ValueError: If invalid metadata_type provided
EveAIException: If lookup fails
"""
if metadata_type not in ['user_metadata', 'system_metadata']:
raise ValueError(f"Invalid metadata_type: {metadata_type}")
try:
# Query for the latest document version matching the criteria
query = (db.session.query(Document, DocumentVersion)
.join(DocumentVersion)
.filter(Document.id == DocumentVersion.doc_id)
.order_by(DocumentVersion.id.desc()))
# Add metadata filtering using PostgreSQL JSONB operators
metadata_field = getattr(DocumentVersion, metadata_type)
for key, value in lookup_criteria.items():
query = query.filter(metadata_field[key].astext == str(value))
# Get first result
result = query.first()
if not result:
raise EveAIException(
f"No document found matching criteria in {metadata_type}",
status_code=404
)
return result
except SQLAlchemyError as e:
current_app.logger.error(f'Database error during document lookup for tenant {tenant_id}: {e}')
raise EveAIException(
"Database error during document lookup",
status_code=500
)
except Exception as e:
current_app.logger.error(f'Error during document lookup for tenant {tenant_id}: {e}')
raise EveAIException(
"Error during document lookup",
status_code=500
)
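
A hedged usage sketch for lookup_document (tenant id and criteria are placeholders):

from common.utils.document_utils import lookup_document

document, version = lookup_document(
    tenant_id=7,
    lookup_criteria={"external_id": "123", "source": "zapier"},
    metadata_type="user_metadata",
)
# document is the matching Document; version is its latest DocumentVersion whose
# user_metadata contains external_id == "123" and source == "zapier"
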
# Add to common/utils/document_utils.py
def refresh_document_with_content(doc_id: int, tenant_id: int, file_content: bytes, api_input: dict) -> tuple:
"""
Refresh document with new content
Args:
doc_id: Document ID
tenant_id: Tenant ID
file_content: New file content
api_input: Additional document information
Returns:
Tuple of (new_version, task_id)
"""
doc = Document.query.get(doc_id)
if not doc:
raise EveAIInvalidDocument(tenant_id, doc_id)
old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()
# Create new version with same file type as original
extension = old_doc_vers.file_type
new_doc_vers = create_version_for_document(
doc, tenant_id,
'', # No URL for content-based updates
old_doc_vers.sub_file_type,
api_input.get('language', old_doc_vers.language),
api_input.get('user_context', old_doc_vers.user_context),
api_input.get('user_metadata', old_doc_vers.user_metadata),
api_input.get('catalog_properties', old_doc_vers.catalog_properties),
)
try:
db.session.add(new_doc_vers)
db.session.commit()
except SQLAlchemyError as e:
db.session.rollback()
return None, str(e)
# Upload new content
upload_file_for_version(new_doc_vers, file_content, extension, tenant_id)
# Start embedding task
task = current_celery.send_task('create_embeddings', args=[tenant_id, new_doc_vers.id], queue='embeddings')
current_app.logger.info(f'Embedding creation started for document {doc_id} on version {new_doc_vers.id} '
f'with task id: {task.id}.')
return new_doc_vers, task.id
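
A hedged call sketch for refresh_document_with_content (ids and payload are placeholders; note that on a database error it returns (None, error_message) instead of raising):

new_version, task_id = refresh_document_with_content(
    doc_id=42,
    tenant_id=7,
    file_content=b"# Updated report\n...",
    api_input={"language": "en", "user_metadata": {"source": "zapier"}},
)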

View File

@@ -55,7 +55,6 @@ class Config(object):
# file upload settings
MAX_CONTENT_LENGTH = 50 * 1024 * 1024
UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']
# supported languages
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
@@ -143,10 +142,7 @@ class Config(object):
LANGCHAIN_ENDPOINT = 'https://api.smith.langchain.com'
LANGCHAIN_PROJECT = "eveai"
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']
SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']
TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test', 'Wordpress Starter']
# The maximum number of seconds allowed for audio compression (to save resources)
MAX_COMPRESSION_DURATION = 60*10  # 10 minutes

View File

@@ -5,6 +5,19 @@ PROCESSOR_TYPES = {
"file_types": "html", "file_types": "html",
"Description": "A processor for HTML files", "Description": "A processor for HTML files",
"configuration": { "configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"html_tags": { "html_tags": {
"name": "HTML Tags", "name": "HTML Tags",
"type": "string", "type": "string",
@@ -45,7 +58,21 @@ PROCESSOR_TYPES = {
"name": "PDF Processor", "name": "PDF Processor",
"file_types": "pdf", "file_types": "pdf",
"Description": "A Processor for PDF files", "Description": "A Processor for PDF files",
"configuration": {} "configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
},
},
"AUDIO_PROCESSOR": {
"name": "AUDIO Processor",
@@ -53,4 +80,89 @@ PROCESSOR_TYPES = {
"Description": "A Processor for audio files", "Description": "A Processor for audio files",
"configuration": {} "configuration": {}
}, },
"MARKDOWN_PROCESSOR": {
"name": "Markdown Processor",
"file_types": "md",
"Description": "A Processor for markdown files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
}
},
"DOCX_PROCESSOR": {
"name": "DOCX Processor",
"file_types": "docx",
"Description": "A processor for DOCX files",
"configuration": {
"chunking_patterns": {
"name": "Chunking Patterns",
"description": "A list of Patterns used to chunk files into logical pieces",
"type": "chunking_patterns",
"required": False
},
"chunking_heading_level": {
"name": "Chunking Heading Level",
"type": "integer",
"description": "Maximum heading level to consider for chunking (1-6)",
"required": False,
"default": 2
},
"extract_comments": {
"name": "Extract Comments",
"type": "boolean",
"description": "Whether to include document comments in the markdown",
"required": False,
"default": False
},
"extract_headers_footers": {
"name": "Extract Headers/Footers",
"type": "boolean",
"description": "Whether to include headers and footers in the markdown",
"required": False,
"default": False
},
"preserve_formatting": {
"name": "Preserve Formatting",
"type": "boolean",
"description": "Whether to preserve bold, italic, and other text formatting",
"required": False,
"default": True
},
"list_style": {
"name": "List Style",
"type": "enum",
"description": "How to format lists in markdown",
"required": False,
"default": "dash",
"allowed_values": ["dash", "asterisk", "plus"]
},
"image_handling": {
"name": "Image Handling",
"type": "enum",
"description": "How to handle embedded images",
"required": False,
"default": "skip",
"allowed_values": ["skip", "extract", "placeholder"]
},
"table_alignment": {
"name": "Table Alignment",
"type": "enum",
"description": "How to align table contents",
"required": False,
"default": "left",
"allowed_values": ["left", "center", "preserve"]
}
}
}
}
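
Illustrative example (not taken from the commit): a stored Processor.configuration for a MARKDOWN_PROCESSOR combining the two new chunking parameters; chunking_patterns is kept as the JSON string produced by patterns_to_json:

configuration = {
    "chunking_heading_level": 3,  # split on #, ## and ###
    "chunking_patterns": '["^Chapter \\\\d+", "^Appendix"]',
}
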

View File

@@ -11,9 +11,10 @@ from common.utils.document_utils import (
create_document_stack, process_url, start_embedding_task,
validate_file_type, EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
get_documents_list, edit_document, refresh_document, edit_document_version,
refresh_document_with_info
refresh_document_with_info, lookup_document
)
from common.utils.eveai_exceptions import EveAIException
from eveai_api.api.auth import requires_service
def validate_date(date_str):
@@ -59,6 +60,7 @@ add_document_response = document_ns.model('AddDocumentResponse', {
@document_ns.route('/add_document')
class AddDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(upload_parser)
@document_ns.response(201, 'Document added successfully', add_document_response)
@document_ns.response(400, 'Validation Error')
@@ -134,6 +136,7 @@ add_url_response = document_ns.model('AddURLResponse', {
@document_ns.route('/add_url')
class AddURL(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(add_url_model)
@document_ns.response(201, 'Document added successfully', add_url_response)
@document_ns.response(400, 'Validation Error')
@@ -190,6 +193,7 @@ document_list_model = document_ns.model('DocumentList', {
@document_ns.route('/list')
class DocumentList(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('list_documents')
@document_ns.marshal_list_with(document_list_model, envelope='documents')
def get(self):
@@ -210,6 +214,7 @@ edit_document_model = document_ns.model('EditDocument', {
@document_ns.route('/<int:document_id>')
class DocumentResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document')
@document_ns.expect(edit_document_model)
@document_ns.response(200, 'Document updated successfully')
@@ -232,6 +237,7 @@ class DocumentResource(Resource):
return e.to_dict(), e.status_code
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('refresh_document')
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
@@ -253,6 +259,7 @@ edit_document_version_model = document_ns.model('EditDocumentVersion', {
@document_ns.route('/version/<int:version_id>')
class DocumentVersionResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document_version')
@document_ns.expect(edit_document_version_model)
@document_ns.response(200, 'Document version updated successfully')
@@ -280,6 +287,7 @@ refresh_document_model = document_ns.model('RefreshDocument', {
@document_ns.route('/<int:document_id>/refresh')
class RefreshDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(404, 'Document not found')
def post(self, document_id):
@@ -310,6 +318,7 @@ class RefreshDocument(Resource):
@document_ns.route('/<int:document_id>/refresh_with_info')
class RefreshDocumentWithInfo(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_document_model)
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(400, 'Validation Error')
@@ -338,3 +347,112 @@ class RefreshDocumentWithInfo(Resource):
except Exception as e:
current_app.logger.error(f'Error refreshing document with info: {str(e)}')
return {'message': 'Internal server error'}, 500
# Define models for lookup requests
lookup_model = document_ns.model('DocumentLookup', {
'lookup_criteria': fields.Raw(required=True,
description='JSON object containing key-value pairs to match in metadata. '
'Example: {"external_id": "123", "source": "zapier", "source_type": "google_docs"}'),
'metadata_type': fields.String(required=True, enum=['user_metadata', 'system_metadata'],
description='Which metadata field to search in')
})
lookup_response = document_ns.model('DocumentLookupResponse', {
'document_id': fields.Integer(description='ID of the found document'),
'document_version_id': fields.Integer(description='ID of the latest document version'),
'name': fields.String(description='Document name'),
'metadata': fields.Raw(description='Full metadata of the found document')
})
@document_ns.route('/lookup')
class DocumentLookup(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(lookup_model)
@document_ns.marshal_with(lookup_response)
@document_ns.response(200, 'Document found', lookup_response)
@document_ns.response(404, 'No document found matching criteria')
def post(self):
"""
Look up a document using metadata criteria
"""
tenant_id = get_jwt_identity()
try:
data = request.json
document, version = lookup_document(
tenant_id,
data['lookup_criteria'],
data['metadata_type']
)
return {
'document_id': document.id,
'document_version_id': version.id,
'name': document.name,
'metadata': getattr(version, data['metadata_type'])
}
except EveAIException as e:
return e.to_dict(), e.status_code
except KeyError as e:
return {'message': f'Missing required field: {str(e)}'}, 400
refresh_content_model = document_ns.model('RefreshDocumentContent', {
'file_content': fields.Raw(required=True, description='The new file content'),
'language': fields.String(required=False, description='Language of the document'),
'user_context': fields.String(required=False, description='User context for the document'),
'user_metadata': fields.Raw(required=False, description='Custom metadata fields'),
'catalog_properties': fields.Raw(required=False, description='Catalog-specific properties'),
'trigger_service': fields.String(required=False, description='Service that triggered the update')
})
@document_ns.route('/<int:document_id>/refresh_content')
class RefreshDocumentContent(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_content_model)
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
"""Refresh a document with new content"""
tenant_id = get_jwt_identity()
try:
data = request.json
file_content = data['file_content']
# Build user_metadata by merging:
# 1. Existing metadata (if any)
# 2. New metadata from request
# 3. Zapier-specific fields
user_metadata = data.get('user_metadata', {})
user_metadata.update({
'source': 'zapier',
'trigger_service': data.get('trigger_service')
})
data['user_metadata'] = user_metadata
# Keep catalog_properties separate
if 'catalog_properties' in data:
# We could add validation here against catalog configuration
data['catalog_properties'] = data['catalog_properties']
new_version, task_id = refresh_document_with_content(
document_id,
tenant_id,
file_content,
data
)
return {
'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {task_id}',
'document_id': document_id,
'document_version_id': new_version.id,
'task_id': task_id
}, 200
except EveAIException as e:
return e.to_dict(), e.status_code
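
A hedged client sketch for the new lookup endpoint (base URL, route prefix and token are placeholders; the exact path depends on how the document namespace is mounted):

import requests

resp = requests.post(
    "https://eveai.example.com/api/document/lookup",
    headers={"Authorization": "Bearer <jwt>"},
    json={
        "lookup_criteria": {"external_id": "123", "source": "zapier", "source_type": "google_docs"},
        "metadata_type": "user_metadata",
    },
)
print(resp.status_code, resp.json())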

View File

@@ -15,14 +15,6 @@ from config.type_defs.retriever_types import RETRIEVER_TYPES
from .dynamic_form_base import DynamicFormBase
def allowed_file(form, field):
if field.data:
filename = field.data.filename
allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
raise ValidationError('Unsupported file type.')
def validate_json(form, field):
if field.data:
try:
@@ -101,7 +93,10 @@ class ProcessorForm(FlaskForm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Dynamically populate the 'type' field using the constructor
self.type.choices = [(key, value['name']) for key, value in PROCESSOR_TYPES.items()]
self.type.choices = sorted(
[(key, value['name']) for key, value in PROCESSOR_TYPES.items()],
key=lambda x: x[1],
)
class EditProcessorForm(DynamicFormBase):
@@ -177,7 +172,7 @@ class EditRetrieverForm(DynamicFormBase):
class AddDocumentForm(DynamicFormBase):
file = FileField('File', validators=[FileRequired(), allowed_file])
file = FileField('File', validators=[FileRequired()])
catalog = StringField('Catalog', render_kw={'readonly': True})
sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
name = StringField('Name', validators=[Length(max=100)])

View File

@@ -14,7 +14,7 @@ import json
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
from common.extensions import db
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
@@ -391,9 +391,6 @@ def add_document():
sub_file_type = form.sub_file_type.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
validate_file_type(extension)
catalog_properties = {}
document_version_configurations = CATALOG_TYPES[catalog.type]['document_version_configurations']
for config in document_version_configurations:

View File

@@ -5,7 +5,46 @@ import json
from wtforms.fields.choices import SelectField
from wtforms.fields.datetime import DateField
from common.utils.config_field_types import TaggingFields
from common.utils.config_field_types import TaggingFields, json_to_patterns, patterns_to_json
class TaggingFieldsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return json.dumps(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# try:
# self.data = json.loads(valuelist[0])
# except json.JSONDecodeError as e:
# raise ValueError('Not valid JSON content')
class ChunkingPatternsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return '\n'.join(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# self.data = [line.strip() for line in valuelist[0].split('\n') if line.strip()]
class DynamicFormBase(FlaskForm):
@@ -80,7 +119,7 @@ class DynamicFormBase(FlaskForm):
# Handle special case for tagging_fields
if field_type == 'tagging_fields':
field_class = TextAreaField
field_class = TaggingFieldsField
extra_classes = 'json-editor'
field_kwargs = {}
elif field_type == 'enum':
@@ -89,6 +128,10 @@ class DynamicFormBase(FlaskForm):
choices = [(str(val), str(val)) for val in allowed_values]
extra_classes = ''
field_kwargs = {'choices': choices}
elif field_type == 'chunking_patterns':
field_class = ChunkingPatternsField
extra_classes = ['monospace-text', 'pattern-input']
field_kwargs = {}
else:
extra_classes = ''
field_class = {
@@ -111,6 +154,12 @@ class DynamicFormBase(FlaskForm):
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to JSON: {e}")
field_data = "{}"
elif field_type == 'chunking_patterns':
try:
field_data = json_to_patterns(field_data)
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to a list of patterns: {e}")
field_data = {}
elif default is not None:
field_data = default
@@ -173,12 +222,17 @@ class DynamicFormBase(FlaskForm):
original_field_name = full_field_name[prefix_length:]
field = getattr(self, full_field_name)
# Parse JSON for tagging_fields type
if isinstance(field, TextAreaField) and field.data:
if isinstance(field, TaggingFieldsField) and field.data:
try:
data[original_field_name] = json.loads(field.data)
except json.JSONDecodeError:
# Validation should catch this, but just in case
data[original_field_name] = field.data
elif isinstance(field, ChunkingPatternsField):
try:
data[original_field_name] = patterns_to_json(field.data)
except Exception as e:
current_app.logger.error(f"Error converting initial data to patterns: {e}")
else:
data[original_field_name] = field.data
return data
@@ -230,5 +284,3 @@ def validate_tagging_fields(form, field):
except (TypeError, ValueError) as e:
raise ValidationError(f"Invalid field definition: {str(e)}")

View File

@@ -46,7 +46,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
try:
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
total_duration = len(audio_info)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
@@ -55,7 +55,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
@@ -87,7 +87,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
@@ -172,14 +172,14 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
self.log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
self.log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -202,7 +202,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
self.log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription

View File

@@ -17,7 +17,7 @@ class BaseProcessor(ABC):
self.tuning_logger = None
self._setup_tuning_logger()
self._log_tuning("Processor initialized", {
self.log_tuning("Processor initialized", {
"processor_type": processor.type if processor else None,
"document_version": document_version.id if document_version else None,
"catalog": catalog.id if catalog else None
@@ -42,6 +42,10 @@ class BaseProcessor(ABC):
def process(self):
pass
@property
def configuration(self):
return self.processor.configuration
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
@@ -78,7 +82,7 @@ class BaseProcessor(ABC):
return markdown
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
def log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
if self.tuning and self.tuning_logger:
try:
self.tuning_logger.log_tuning('processor', message, data)
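
With _log_tuning promoted to the public log_tuning and the new configuration property, callers outside the processor can read its settings and write tuning output directly; the embedding task further below does roughly:

heading_level = processor.configuration.get('chunking_heading_level', 2)
processor.log_tuning("Processor returned: ", {"markdown": markdown, "title": title})
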

View File

@@ -0,0 +1,129 @@
import docx
import io
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry
from common.extensions import minio_client
import re
class DocxProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.config = processor.configuration
self.extract_comments = self.config.get('extract_comments', False)
self.extract_headers_footers = self.config.get('extract_headers_footers', False)
self.preserve_formatting = self.config.get('preserve_formatting', True)
self.list_style = self.config.get('list_style', 'dash')
self.image_handling = self.config.get('image_handling', 'skip')
self.table_alignment = self.config.get('table_alignment', 'left')
def process(self):
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
doc = docx.Document(io.BytesIO(file_data))
markdown = self._convert_to_markdown(doc)
title = self._extract_title(doc)
self._save_markdown(markdown)
return markdown, title
except Exception as e:
self._log(f"Error processing DOCX: {str(e)}", level='error')
raise
def _convert_to_markdown(self, doc):
markdown_parts = []
if self.extract_headers_footers:
for section in doc.sections:
if section.header.paragraphs:
markdown_parts.extend(self._process_paragraphs(section.header.paragraphs))
markdown_parts.extend(self._process_paragraphs(doc.paragraphs))
if self.extract_comments and doc.comments:
markdown_parts.append("\n## Comments\n")
for comment in doc.comments:
markdown_parts.append(f"> {comment.text}\n")
return "\n".join(markdown_parts)
def _process_paragraphs(self, paragraphs):
markdown_parts = []
in_list = False
for para in paragraphs:
if not para.text.strip():
continue
style = para.style.name.lower()
if 'heading' in style:
level = int(style[-1]) if style[-1].isdigit() else 1
markdown_parts.append(f"{'#' * level} {para.text}\n")
elif para._p.pPr and para._p.pPr.numPr: # List item
marker = self._get_list_marker()
markdown_parts.append(f"{marker} {para.text}\n")
in_list = True
else:
if in_list:
markdown_parts.append("\n")
in_list = False
text = para.text
if self.preserve_formatting:
text = self._apply_formatting(para)
markdown_parts.append(f"{text}\n")
return markdown_parts
def _get_list_marker(self):
return {
'dash': '-',
'asterisk': '*',
'plus': '+'
}.get(self.list_style, '-')
def _apply_formatting(self, paragraph):
text = paragraph.text
if not text:
return ""
runs = paragraph.runs
formatted_parts = []
for run in runs:
part = run.text
if run.bold:
part = f"**{part}**"
if run.italic:
part = f"*{part}*"
if run.underline:
part = f"__{part}__"
formatted_parts.append(part)
return "".join(formatted_parts)
def _extract_title(self, doc):
if doc.paragraphs:
first_para = doc.paragraphs[0]
if 'heading' in first_para.style.name.lower():
return first_para.text.strip()
# Look for first Heading 1 in document
for para in doc.paragraphs:
if para.style.name.lower() == 'heading 1':
return para.text.strip()
return "Untitled Document"
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)

View File

@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
self.log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
input_html = {"html": chunk} input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html) markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk) markdown_chunks.append(markdown_chunk)
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk}) self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks) markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}') self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')

View File

@@ -0,0 +1,48 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
def _find_first_h1(markdown: str) -> str:
# Look for # Header (allowing spaces after #)
match = re.search(r'^#\s+(.+)$', markdown, re.MULTILINE)
return match.group(1).strip() if match else ""
class MarkdownProcessor(BaseProcessor):
def __init__(self, tenant, model_variables, document_version, catalog, processor):
super().__init__(tenant, model_variables, document_version, catalog, processor)
self.chunk_size = catalog.max_chunk_size
self.chunk_overlap = 0
self.tuning = self.processor.tuning
def process(self):
self._log("Starting Markdown processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.bucket_name,
self.document_version.object_name,
)
markdown = file_data.decode('utf-8')
title = _find_first_h1(markdown)
self._save_markdown(markdown)
self._log("Finished processing Markdown")
return markdown, title
except Exception as e:
self._log(f"Error processing Markdown: {str(e)}", level='error')
raise
ProcessorRegistry.register("MARKDOWN_PROCESSOR", MarkdownProcessor)

View File

@@ -57,7 +57,7 @@ class PDFProcessor(BaseProcessor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
@@ -119,7 +119,7 @@ class PDFProcessor(BaseProcessor):
markdown_table = self._table_to_markdown(table)
if markdown_table:  # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables

View File

@@ -45,7 +45,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
self.log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
@@ -64,7 +64,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
self.log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,

View File

@@ -1,3 +1,4 @@
import re
from datetime import datetime as dt, timezone as tz
from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
from config.type_defs.processor_types import PROCESSOR_TYPES
from eveai_workers.processors.processor_registry import ProcessorRegistry
from common.utils.config_field_types import json_to_pattern_list
# Healthcheck task
@current_celery.task(name='ping', queue='embeddings')
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
processor=processor
)
markdown, title = document_processor.process()
document_processor.log_tuning("Processor returned: ", {
'markdown': markdown,
'title': title
})
with current_event.create_span("Embedding"): with current_event.create_span("Embedding"):
embed_markdown(tenant, model_variables, document_version, catalog, markdown, title) embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)
current_event.log("Finished Embedding Creation Task") current_event.log("Finished Embedding Creation Task")
@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
raise
def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
processor.log_tuning("Chunks: ", {'chunks': chunks})
# Enrich chunks
with current_event.create_span("Enrich Chunks"):
enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})
# Create embeddings
with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
return new_embeddings
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
try:
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
heading_level = processor.configuration.get('chunking_heading_level', 2)
# Download the markdown file from MinIO
markdown_data = minio_client.download_document_file(tenant_id,
document_version.bucket_name,
markdown_on,
)
markdown = markdown_data.decode('utf-8')
headers_to_split_on = [
("#", "Header 1"),
(f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
("##", "Header 2"),
]
processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown)
potential_chunks = [doc.page_content for doc in md_header_splits]
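
For example, with chunking_heading_level = 3 the comprehension above produces:

heading_level = 3
headers_to_split_on = [(f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))]
# -> [('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3')]
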
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
raise
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
actual_chunks = []
current_chunk = ""
current_length = 0
def matches_chunking_pattern(text, patterns):
if not patterns:
return False
# Get the first line of the text
first_line = text.split('\n', 1)[0].strip()
# Check if it's a header at appropriate level
header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
if not header_match:
return False
# Get the heading level (number of #s)
header_level = len(header_match.group(1))
# Get the header text
header_text = header_match.group(2)
# Check if header matches any pattern
for pattern in patterns:
try:
processor.log_tuning('Pattern check: ', {
'pattern: ': pattern,
'text': header_text
})
if re.search(pattern, header_text, re.IGNORECASE):
return True
except Exception as e:
current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
continue
return False
chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
processor.log_tuning(f'Chunking Patterns Extraction: ', {
'Full Configuration': processor.configuration,
'Chunking Patterns': chunking_patterns,
})
for chunk in potential_chunks:
chunk_length = len(chunk)
# Force new chunk if pattern matches
if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
if current_chunk and current_length >= min_chars:
actual_chunks.append(current_chunk)
current_chunk = chunk
current_length = chunk_length
continue
if current_length + chunk_length > max_chars:
if current_length >= min_chars:
actual_chunks.append(current_chunk)
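
A self-contained sketch of the header-matching rule used above (the tuning log call and the per-pattern error guard are omitted):

import re

def matches_chunking_pattern(text, patterns):
    first_line = text.split('\n', 1)[0].strip()
    header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
    if not header_match:
        return False
    header_text = header_match.group(2)
    return any(re.search(p, header_text, re.IGNORECASE) for p in patterns)

matches_chunking_pattern("## Chapter 4: Results\nBody text", [r"^Chapter \d+"])  # True
matches_chunking_pattern("Plain paragraph", [r"^Chapter \d+"])                   # False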

View File

@@ -89,3 +89,4 @@ prometheus_flask_exporter~=0.23.1
prometheus_client~=0.20.0
babel~=2.16.0
dogpile.cache~=1.3.3
python-docx~=1.1.2