- Addition of general chunking parameters chunking_heading_level and chunking_patterns
- Addition of Processor types docx and markdown
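For context, an illustrative sketch (not part of the diff) of how the two new parameters might look in a saved processor configuration; the JSON-string form of chunking_patterns matches the patterns_to_json helper added in this commit, and the values shown are assumptions:

    # Assumed shape of a Processor.configuration dict using the new parameters
    example_configuration = {
        # JSON-encoded list of regex patterns, as produced by patterns_to_json()
        "chunking_patterns": '["^Chapter [0-9]+", "^Appendix [A-Z]"]',
        # Split on Markdown headings up to this level (1-6); the default is 2
        "chunking_heading_level": 3,
    }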
@@ -64,6 +64,20 @@ class TaggingFields(BaseModel):
     }


+class ChunkingPatternsField(BaseModel):
+    """Represents a set of chunking patterns"""
+    patterns: List[str]
+
+    @field_validator('patterns')
+    def validate_patterns(cls, patterns):
+        for pattern in patterns:
+            try:
+                re.compile(pattern)
+            except re.error as e:
+                raise ValueError(f"Invalid regex pattern '{pattern}': {str(e)}")
+        return patterns
+
+
 class ArgumentConstraint(BaseModel):
     """Base class for all argument constraints"""
     description: Optional[str] = None
@@ -611,3 +625,38 @@ def _generate_yaml_docs(fields: Dict[str, Any], version: str) -> str:
     }

     return yaml.dump(doc, sort_keys=False, default_flow_style=False)
+
+
+def patterns_to_json(text_area_content: str) -> str:
+    """Convert line-based patterns to JSON"""
+    text_area_content = text_area_content.strip()
+    if len(text_area_content) == 0:
+        return json.dumps([])
+    # Split on newlines and remove empty lines
+    patterns = [line.strip() for line in text_area_content.split('\n') if line.strip()]
+    return json.dumps(patterns)
+
+
+def json_to_patterns(json_content: str) -> str:
+    """Convert JSON patterns list to text area content"""
+    try:
+        patterns = json.loads(json_content)
+        if not isinstance(patterns, list):
+            raise ValueError("JSON must contain a list of patterns")
+        # Join with newlines
+        return '\n'.join(patterns)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON format: {e}")
+
+
+def json_to_pattern_list(json_content: str) -> list:
+    """Convert JSON patterns list to text area content"""
+    try:
+        patterns = json.loads(json_content)
+        if not isinstance(patterns, list):
+            raise ValueError("JSON must contain a list of patterns")
+        # Unescape if needed
+        patterns = [pattern.replace('\\\\', '\\') for pattern in patterns]
+        return patterns
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON format: {e}")
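For illustration, a minimal usage sketch of the three helpers added above (not part of the diff), assuming they are imported from common.utils.config_field_types as elsewhere in this commit:

    from common.utils.config_field_types import (
        patterns_to_json, json_to_patterns, json_to_pattern_list
    )

    # Text-area content: one regex per line; blank lines are dropped
    text = "^Chapter [0-9]+\n\n^Appendix [A-Z]\n"

    as_json = patterns_to_json(text)          # '["^Chapter [0-9]+", "^Appendix [A-Z]"]'
    back_to_text = json_to_patterns(as_json)  # '^Chapter [0-9]+\n^Appendix [A-Z]'
    as_list = json_to_pattern_list(as_json)   # ['^Chapter [0-9]+', '^Appendix [A-Z]']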
@@ -12,7 +12,7 @@ import requests
 from urllib.parse import urlparse, unquote, urlunparse
 import os
 from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
-                               EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion)
+                               EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
 from ..models.user import Tenant


@@ -219,12 +219,6 @@ def start_embedding_task(tenant_id, doc_vers_id):
     return task.id


-def validate_file_type(extension):
-    if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
-        raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
-                                       f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
-
-
 def get_filename_from_url(url):
     parsed_url = urlparse(url)
     path_parts = parsed_url.path.split('/')
@@ -363,3 +357,109 @@ def cope_with_local_url(url):
     return url


+def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -> tuple[Document, DocumentVersion]:
+    """
+    Look up a document using metadata criteria
+
+    Args:
+        tenant_id: ID of the tenant
+        lookup_criteria: Dictionary of key-value pairs to match in metadata
+        metadata_type: Which metadata to search in ('user_metadata' or 'system_metadata')
+
+    Returns:
+        Tuple of (Document, DocumentVersion) if found
+
+    Raises:
+        ValueError: If invalid metadata_type provided
+        EveAIException: If lookup fails
+    """
+    if metadata_type not in ['user_metadata', 'system_metadata']:
+        raise ValueError(f"Invalid metadata_type: {metadata_type}")
+
+    try:
+        # Query for the latest document version matching the criteria
+        query = (db.session.query(Document, DocumentVersion)
+                 .join(DocumentVersion)
+                 .filter(Document.id == DocumentVersion.doc_id)
+                 .order_by(DocumentVersion.id.desc()))
+
+        # Add metadata filtering using PostgreSQL JSONB operators
+        metadata_field = getattr(DocumentVersion, metadata_type)
+        for key, value in lookup_criteria.items():
+            query = query.filter(metadata_field[key].astext == str(value))
+
+        # Get first result
+        result = query.first()
+
+        if not result:
+            raise EveAIException(
+                f"No document found matching criteria in {metadata_type}",
+                status_code=404
+            )
+
+        return result
+
+    except SQLAlchemyError as e:
+        current_app.logger.error(f'Database error during document lookup for tenant {tenant_id}: {e}')
+        raise EveAIException(
+            "Database error during document lookup",
+            status_code=500
+        )
+    except Exception as e:
+        current_app.logger.error(f'Error during document lookup for tenant {tenant_id}: {e}')
+        raise EveAIException(
+            "Error during document lookup",
+            status_code=500
+        )
+
+
+# Add to common/utils/document_utils.py
+
+def refresh_document_with_content(doc_id: int, tenant_id: int, file_content: bytes, api_input: dict) -> tuple:
+    """
+    Refresh document with new content
+
+    Args:
+        doc_id: Document ID
+        tenant_id: Tenant ID
+        file_content: New file content
+        api_input: Additional document information
+
+    Returns:
+        Tuple of (new_version, task_id)
+    """
+    doc = Document.query.get(doc_id)
+    if not doc:
+        raise EveAIInvalidDocument(tenant_id, doc_id)
+
+    old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()
+
+    # Create new version with same file type as original
+    extension = old_doc_vers.file_type
+
+    new_doc_vers = create_version_for_document(
+        doc, tenant_id,
+        '',  # No URL for content-based updates
+        old_doc_vers.sub_file_type,
+        api_input.get('language', old_doc_vers.language),
+        api_input.get('user_context', old_doc_vers.user_context),
+        api_input.get('user_metadata', old_doc_vers.user_metadata),
+        api_input.get('catalog_properties', old_doc_vers.catalog_properties),
+    )
+
+    try:
+        db.session.add(new_doc_vers)
+        db.session.commit()
+    except SQLAlchemyError as e:
+        db.session.rollback()
+        return None, str(e)
+
+    # Upload new content
+    upload_file_for_version(new_doc_vers, file_content, extension, tenant_id)
+
+    # Start embedding task
+    task = current_celery.send_task('create_embeddings', args=[tenant_id, new_doc_vers.id], queue='embeddings')
+    current_app.logger.info(f'Embedding creation started for document {doc_id} on version {new_doc_vers.id} '
+                            f'with task id: {task.id}.')
+
+    return new_doc_vers, task.id
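For illustration (not part of the commit), a sketch of calling lookup_document from application code; it assumes an active Flask application context, a hypothetical tenant id, and criteria in the style used by the new lookup endpoint later in this diff:

    from common.utils.document_utils import lookup_document

    # Find the latest version of the document pushed by an integration for external id "123"
    document, version = lookup_document(
        tenant_id=42,  # hypothetical tenant id
        lookup_criteria={"external_id": "123", "source": "zapier"},
        metadata_type="user_metadata",
    )
    print(document.id, version.id)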
@@ -55,7 +55,6 @@ class Config(object):

     # file upload settings
     MAX_CONTENT_LENGTH = 50 * 1024 * 1024
-    UPLOAD_EXTENSIONS = ['.txt', '.pdf', '.png', '.jpg', '.jpeg', '.gif']

     # supported languages
     SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
@@ -143,10 +142,7 @@ class Config(object):
     LANGCHAIN_ENDPOINT = 'https://api.smith.langchain.com'
     LANGCHAIN_PROJECT = "eveai"

-    SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']
-
-    TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test', 'Wordpress Starter']
+    TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']

     # The maximum number of seconds allowed for audio compression (to save resources)
     MAX_COMPRESSION_DURATION = 60*10  # 10 minutes
@@ -5,6 +5,19 @@ PROCESSOR_TYPES = {
         "file_types": "html",
         "Description": "A processor for HTML files",
         "configuration": {
+            "chunking_patterns": {
+                "name": "Chunking Patterns",
+                "description": "A list of Patterns used to chunk files into logical pieces",
+                "type": "chunking_patterns",
+                "required": False
+            },
+            "chunking_heading_level": {
+                "name": "Chunking Heading Level",
+                "type": "integer",
+                "description": "Maximum heading level to consider for chunking (1-6)",
+                "required": False,
+                "default": 2
+            },
             "html_tags": {
                 "name": "HTML Tags",
                 "type": "string",
@@ -45,7 +58,21 @@ PROCESSOR_TYPES = {
         "name": "PDF Processor",
         "file_types": "pdf",
         "Description": "A Processor for PDF files",
-        "configuration": {}
+        "configuration": {
+            "chunking_patterns": {
+                "name": "Chunking Patterns",
+                "description": "A list of Patterns used to chunk files into logical pieces",
+                "type": "chunking_patterns",
+                "required": False
+            },
+            "chunking_heading_level": {
+                "name": "Chunking Heading Level",
+                "type": "integer",
+                "description": "Maximum heading level to consider for chunking (1-6)",
+                "required": False,
+                "default": 2
+            },
+        },
     },
     "AUDIO_PROCESSOR": {
         "name": "AUDIO Processor",
@@ -53,4 +80,89 @@ PROCESSOR_TYPES = {
         "Description": "A Processor for audio files",
         "configuration": {}
     },
+    "MARKDOWN_PROCESSOR": {
+        "name": "Markdown Processor",
+        "file_types": "md",
+        "Description": "A Processor for markdown files",
+        "configuration": {
+            "chunking_patterns": {
+                "name": "Chunking Patterns",
+                "description": "A list of Patterns used to chunk files into logical pieces",
+                "type": "chunking_patterns",
+                "required": False
+            },
+            "chunking_heading_level": {
+                "name": "Chunking Heading Level",
+                "type": "integer",
+                "description": "Maximum heading level to consider for chunking (1-6)",
+                "required": False,
+                "default": 2
+            },
+        }
+    },
+    "DOCX_PROCESSOR": {
+        "name": "DOCX Processor",
+        "file_types": "docx",
+        "Description": "A processor for DOCX files",
+        "configuration": {
+            "chunking_patterns": {
+                "name": "Chunking Patterns",
+                "description": "A list of Patterns used to chunk files into logical pieces",
+                "type": "chunking_patterns",
+                "required": False
+            },
+            "chunking_heading_level": {
+                "name": "Chunking Heading Level",
+                "type": "integer",
+                "description": "Maximum heading level to consider for chunking (1-6)",
+                "required": False,
+                "default": 2
+            },
+            "extract_comments": {
+                "name": "Extract Comments",
+                "type": "boolean",
+                "description": "Whether to include document comments in the markdown",
+                "required": False,
+                "default": False
+            },
+            "extract_headers_footers": {
+                "name": "Extract Headers/Footers",
+                "type": "boolean",
+                "description": "Whether to include headers and footers in the markdown",
+                "required": False,
+                "default": False
+            },
+            "preserve_formatting": {
+                "name": "Preserve Formatting",
+                "type": "boolean",
+                "description": "Whether to preserve bold, italic, and other text formatting",
+                "required": False,
+                "default": True
+            },
+            "list_style": {
+                "name": "List Style",
+                "type": "enum",
+                "description": "How to format lists in markdown",
+                "required": False,
+                "default": "dash",
+                "allowed_values": ["dash", "asterisk", "plus"]
+            },
+            "image_handling": {
+                "name": "Image Handling",
+                "type": "enum",
+                "description": "How to handle embedded images",
+                "required": False,
+                "default": "skip",
+                "allowed_values": ["skip", "extract", "placeholder"]
+            },
+            "table_alignment": {
+                "name": "Table Alignment",
+                "type": "enum",
+                "description": "How to align table contents",
+                "required": False,
+                "default": "left",
+                "allowed_values": ["left", "center", "preserve"]
+            }
+        }
+    }
 }
@@ -11,9 +11,10 @@ from common.utils.document_utils import (
     create_document_stack, process_url, start_embedding_task,
     validate_file_type, EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
     get_documents_list, edit_document, refresh_document, edit_document_version,
-    refresh_document_with_info
+    refresh_document_with_info, lookup_document
 )
 from common.utils.eveai_exceptions import EveAIException
+from eveai_api.api.auth import requires_service


 def validate_date(date_str):
@@ -59,6 +60,7 @@ add_document_response = document_ns.model('AddDocumentResponse', {
 @document_ns.route('/add_document')
 class AddDocument(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.expect(upload_parser)
     @document_ns.response(201, 'Document added successfully', add_document_response)
     @document_ns.response(400, 'Validation Error')
@@ -134,6 +136,7 @@ add_url_response = document_ns.model('AddURLResponse', {
 @document_ns.route('/add_url')
 class AddURL(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.expect(add_url_model)
     @document_ns.response(201, 'Document added successfully', add_url_response)
     @document_ns.response(400, 'Validation Error')
@@ -190,6 +193,7 @@ document_list_model = document_ns.model('DocumentList', {
 @document_ns.route('/list')
 class DocumentList(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.doc('list_documents')
     @document_ns.marshal_list_with(document_list_model, envelope='documents')
     def get(self):
@@ -210,6 +214,7 @@ edit_document_model = document_ns.model('EditDocument', {
 @document_ns.route('/<int:document_id>')
 class DocumentResource(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.doc('edit_document')
     @document_ns.expect(edit_document_model)
     @document_ns.response(200, 'Document updated successfully')
@@ -232,6 +237,7 @@ class DocumentResource(Resource):
             return e.to_dict(), e.status_code

     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.doc('refresh_document')
     @document_ns.response(200, 'Document refreshed successfully')
     def post(self, document_id):
@@ -253,6 +259,7 @@ edit_document_version_model = document_ns.model('EditDocumentVersion', {
 @document_ns.route('/version/<int:version_id>')
 class DocumentVersionResource(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.doc('edit_document_version')
     @document_ns.expect(edit_document_version_model)
     @document_ns.response(200, 'Document version updated successfully')
@@ -280,6 +287,7 @@ refresh_document_model = document_ns.model('RefreshDocument', {
 @document_ns.route('/<int:document_id>/refresh')
 class RefreshDocument(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.response(200, 'Document refreshed successfully')
     @document_ns.response(404, 'Document not found')
     def post(self, document_id):
@@ -310,6 +318,7 @@ class RefreshDocument(Resource):
 @document_ns.route('/<int:document_id>/refresh_with_info')
 class RefreshDocumentWithInfo(Resource):
     @jwt_required()
+    @requires_service('DOCAPI')
     @document_ns.expect(refresh_document_model)
     @document_ns.response(200, 'Document refreshed successfully')
     @document_ns.response(400, 'Validation Error')
@@ -338,3 +347,112 @@ class RefreshDocumentWithInfo(Resource):
         except Exception as e:
             current_app.logger.error(f'Error refreshing document with info: {str(e)}')
             return {'message': 'Internal server error'}, 500
+
+
+# Define models for lookup requests
+lookup_model = document_ns.model('DocumentLookup', {
+    'lookup_criteria': fields.Raw(required=True,
+                                  description='JSON object containing key-value pairs to match in metadata. '
+                                              'Example: {"external_id": "123", "source": "zapier", "source_type": "google_docs"}'),
+    'metadata_type': fields.String(required=True, enum=['user_metadata', 'system_metadata'],
+                                   description='Which metadata field to search in')
+})
+
+lookup_response = document_ns.model('DocumentLookupResponse', {
+    'document_id': fields.Integer(description='ID of the found document'),
+    'document_version_id': fields.Integer(description='ID of the latest document version'),
+    'name': fields.String(description='Document name'),
+    'metadata': fields.Raw(description='Full metadata of the found document')
+})
+
+
+@document_ns.route('/lookup')
+class DocumentLookup(Resource):
+    @jwt_required()
+    @requires_service('DOCAPI')
+    @document_ns.expect(lookup_model)
+    @document_ns.marshal_with(lookup_response)
+    @document_ns.response(200, 'Document found', lookup_response)
+    @document_ns.response(404, 'No document found matching criteria')
+    def post(self):
+        """
+        Look up a document using metadata criteria
+        """
+        tenant_id = get_jwt_identity()
+        try:
+            data = request.json
+            document, version = lookup_document(
+                tenant_id,
+                data['lookup_criteria'],
+                data['metadata_type']
+            )
+
+            return {
+                'document_id': document.id,
+                'document_version_id': version.id,
+                'name': document.name,
+                'metadata': getattr(version, data['metadata_type'])
+            }
+
+        except EveAIException as e:
+            return e.to_dict(), e.status_code
+
+        except KeyError as e:
+            return {'message': f'Missing required field: {str(e)}'}, 400
+
+
+refresh_content_model = document_ns.model('RefreshDocumentContent', {
+    'file_content': fields.Raw(required=True, description='The new file content'),
+    'language': fields.String(required=False, description='Language of the document'),
+    'user_context': fields.String(required=False, description='User context for the document'),
+    'user_metadata': fields.Raw(required=False, description='Custom metadata fields'),
+    'catalog_properties': fields.Raw(required=False, description='Catalog-specific properties'),
+    'trigger_service': fields.String(required=False, description='Service that triggered the update')
+})
+
+
+@document_ns.route('/<int:document_id>/refresh_content')
+class RefreshDocumentContent(Resource):
+    @jwt_required()
+    @requires_service('DOCAPI')
+    @document_ns.expect(refresh_content_model)
+    @document_ns.response(200, 'Document refreshed successfully')
+    def post(self, document_id):
+        """Refresh a document with new content"""
+        tenant_id = get_jwt_identity()
+        try:
+            data = request.json
+            file_content = data['file_content']
+
+            # Build user_metadata by merging:
+            # 1. Existing metadata (if any)
+            # 2. New metadata from request
+            # 3. Zapier-specific fields
+            user_metadata = data.get('user_metadata', {})
+            user_metadata.update({
+                'source': 'zapier',
+                'trigger_service': data.get('trigger_service')
+            })
+            data['user_metadata'] = user_metadata
+
+            # Keep catalog_properties separate
+            if 'catalog_properties' in data:
+                # We could add validation here against catalog configuration
+                data['catalog_properties'] = data['catalog_properties']
+
+            new_version, task_id = refresh_document_with_content(
+                document_id,
+                tenant_id,
+                file_content,
+                data
+            )
+
+            return {
+                'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {task_id}',
+                'document_id': document_id,
+                'document_version_id': new_version.id,
+                'task_id': task_id
+            }, 200
+
+        except EveAIException as e:
+            return e.to_dict(), e.status_code
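A hedged sketch (not part of the diff) of calling the new lookup endpoint over HTTP; the base URL, the route prefix for document_ns, and the token are placeholders:

    import requests

    # Placeholder values -- adjust to the actual deployment
    BASE_URL = "https://eveai.example.com/api/documents"
    TOKEN = "<jwt-with-DOCAPI-service>"

    resp = requests.post(
        f"{BASE_URL}/lookup",
        headers={"Authorization": f"Bearer {TOKEN}"},
        json={
            "lookup_criteria": {"external_id": "123", "source": "zapier"},
            "metadata_type": "user_metadata",
        },
    )
    print(resp.status_code, resp.json())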
@@ -15,14 +15,6 @@ from config.type_defs.retriever_types import RETRIEVER_TYPES
 from .dynamic_form_base import DynamicFormBase


-def allowed_file(form, field):
-    if field.data:
-        filename = field.data.filename
-        allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
-        if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
-            raise ValidationError('Unsupported file type.')
-
-
 def validate_json(form, field):
     if field.data:
         try:
@@ -101,7 +93,10 @@ class ProcessorForm(FlaskForm):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # Dynamically populate the 'type' field using the constructor
-        self.type.choices = [(key, value['name']) for key, value in PROCESSOR_TYPES.items()]
+        self.type.choices = sorted(
+            [(key, value['name']) for key, value in PROCESSOR_TYPES.items()],
+            key=lambda x: x[1],
+        )


 class EditProcessorForm(DynamicFormBase):
@@ -177,7 +172,7 @@ class EditRetrieverForm(DynamicFormBase):


 class AddDocumentForm(DynamicFormBase):
-    file = FileField('File', validators=[FileRequired(), allowed_file])
+    file = FileField('File', validators=[FileRequired()])
     catalog = StringField('Catalog', render_kw={'readonly': True})
     sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
     name = StringField('Name', validators=[Length(max=100)])
@@ -14,7 +14,7 @@ import json
 from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
 from common.extensions import db
 from common.models.interaction import Specialist, SpecialistRetriever
-from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
+from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
     edit_document, \
     edit_document_version, refresh_document
 from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
@@ -391,9 +391,6 @@ def add_document():
         sub_file_type = form.sub_file_type.data
         filename = secure_filename(file.filename)
         extension = filename.rsplit('.', 1)[1].lower()
-
-        validate_file_type(extension)
-
         catalog_properties = {}
         document_version_configurations = CATALOG_TYPES[catalog.type]['document_version_configurations']
         for config in document_version_configurations:
@@ -5,7 +5,46 @@ import json

 from wtforms.fields.choices import SelectField
 from wtforms.fields.datetime import DateField
-from common.utils.config_field_types import TaggingFields
+from common.utils.config_field_types import TaggingFields, json_to_patterns, patterns_to_json


+class TaggingFieldsField(TextAreaField):
+    def __init__(self, *args, **kwargs):
+        kwargs['render_kw'] = {
+            'class': 'chunking-patterns-field',
+            'data-handle-enter': 'true'
+        }
+        super().__init__(*args, **kwargs)
+
+    # def _value(self):
+    #     if self.data:
+    #         return json.dumps(self.data)
+    #     return ''
+    #
+    # def process_formdata(self, valuelist):
+    #     if valuelist and valuelist[0]:
+    #         try:
+    #             self.data = json.loads(valuelist[0])
+    #         except json.JSONDecodeError as e:
+    #             raise ValueError('Not valid JSON content')
+
+
+class ChunkingPatternsField(TextAreaField):
+    def __init__(self, *args, **kwargs):
+        kwargs['render_kw'] = {
+            'class': 'chunking-patterns-field',
+            'data-handle-enter': 'true'
+        }
+        super().__init__(*args, **kwargs)
+
+    # def _value(self):
+    #     if self.data:
+    #         return '\n'.join(self.data)
+    #     return ''
+    #
+    # def process_formdata(self, valuelist):
+    #     if valuelist and valuelist[0]:
+    #         self.data = [line.strip() for line in valuelist[0].split('\n') if line.strip()]
+
+
 class DynamicFormBase(FlaskForm):
@@ -80,7 +119,7 @@ class DynamicFormBase(FlaskForm):

         # Handle special case for tagging_fields
         if field_type == 'tagging_fields':
-            field_class = TextAreaField
+            field_class = TaggingFieldsField
             extra_classes = 'json-editor'
             field_kwargs = {}
         elif field_type == 'enum':
@@ -89,6 +128,10 @@ class DynamicFormBase(FlaskForm):
             choices = [(str(val), str(val)) for val in allowed_values]
             extra_classes = ''
             field_kwargs = {'choices': choices}
+        elif field_type == 'chunking_patterns':
+            field_class = ChunkingPatternsField
+            extra_classes = ['monospace-text', 'pattern-input']
+            field_kwargs = {}
         else:
             extra_classes = ''
             field_class = {
@@ -111,6 +154,12 @@ class DynamicFormBase(FlaskForm):
             except (TypeError, ValueError) as e:
                 current_app.logger.error(f"Error converting initial data to JSON: {e}")
                 field_data = "{}"
+        elif field_type == 'chunking_patterns':
+            try:
+                field_data = json_to_patterns(field_data)
+            except (TypeError, ValueError) as e:
+                current_app.logger.error(f"Error converting initial data to a list of patterns: {e}")
+                field_data = {}
         elif default is not None:
             field_data = default

@@ -173,12 +222,17 @@ class DynamicFormBase(FlaskForm):
             original_field_name = full_field_name[prefix_length:]
             field = getattr(self, full_field_name)
             # Parse JSON for tagging_fields type
-            if isinstance(field, TextAreaField) and field.data:
+            if isinstance(field, TaggingFieldsField) and field.data:
                 try:
                     data[original_field_name] = json.loads(field.data)
                 except json.JSONDecodeError:
                     # Validation should catch this, but just in case
                     data[original_field_name] = field.data
+            elif isinstance(field, ChunkingPatternsField):
+                try:
+                    data[original_field_name] = patterns_to_json(field.data)
+                except Exception as e:
+                    current_app.logger.error(f"Error converting initial data to patterns: {e}")
             else:
                 data[original_field_name] = field.data
         return data
@@ -230,5 +284,3 @@ def validate_tagging_fields(form, field):
     except (TypeError, ValueError) as e:
         raise ValidationError(f"Invalid field definition: {str(e)}")

-
-
@@ -46,7 +46,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
         try:
            audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
            total_duration = len(audio_info)
-            self._log_tuning("_compress_audio", {
+            self.log_tuning("_compress_audio", {
                "Audio Duration (ms)": total_duration,
            })
            segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
@@ -55,7 +55,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
            compressed_segments = AudioSegment.empty()

            for i in range(total_chunks):
-                self._log_tuning("_compress_audio", {
+                self.log_tuning("_compress_audio", {
                    "Segment Nr": f"{i + 1} of {total_chunks}"
                })

@@ -87,7 +87,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
                compressed_filename,
                compressed_buffer.read()
            )
-            self._log_tuning("_compress_audio", {
+            self.log_tuning("_compress_audio", {
                "Compressed audio to MinIO": compressed_filename
            })

@@ -172,14 +172,14 @@ class AudioProcessor(TranscriptionBaseProcessor):

                    transcriptions.append(trans)

-                    self._log_tuning("_transcribe_audio", {
+                    self.log_tuning("_transcribe_audio", {
                        "Chunk Nr": f"{i + 1} of {total_chunks}",
                        "Segment Duration": segment_duration,
                        "Transcription": trans,
                    })
                else:
                    self._log("Warning: Received empty transcription", level='warning')
-                    self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
+                    self.log_tuning("_transcribe_audio", {"ERROR": "No transcription"})

        except Exception as e:
            self._log(f"Error during transcription: {str(e)}", level='error')
@@ -202,7 +202,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
            transcription_filename,
            full_transcription.encode('utf-8')
        )
-        self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
+        self.log_tuning(f"Saved transcription to MinIO: {transcription_filename}")

        return full_transcription

@@ -17,7 +17,7 @@ class BaseProcessor(ABC):
         self.tuning_logger = None
         self._setup_tuning_logger()

-        self._log_tuning("Processor initialized", {
+        self.log_tuning("Processor initialized", {
             "processor_type": processor.type if processor else None,
             "document_version": document_version.id if document_version else None,
             "catalog": catalog.id if catalog else None
@@ -42,6 +42,10 @@ class BaseProcessor(ABC):
     def process(self):
         pass

+    @property
+    def configuration(self):
+        return self.processor.configuration
+
     def _save_markdown(self, markdown):
         markdown_filename = f"{self.document_version.id}.md"
         minio_client.upload_document_file(
@@ -78,7 +82,7 @@ class BaseProcessor(ABC):

         return markdown

-    def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
+    def log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
         if self.tuning and self.tuning_logger:
             try:
                 self.tuning_logger.log_tuning('processor', message, data)
eveai_workers/processors/docx_processor.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+import docx
+import io
+from .base_processor import BaseProcessor
+from .processor_registry import ProcessorRegistry
+from common.extensions import minio_client
+import re
+
+
+class DocxProcessor(BaseProcessor):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
+        super().__init__(tenant, model_variables, document_version, catalog, processor)
+        self.config = processor.configuration
+        self.extract_comments = self.config.get('extract_comments', False)
+        self.extract_headers_footers = self.config.get('extract_headers_footers', False)
+        self.preserve_formatting = self.config.get('preserve_formatting', True)
+        self.list_style = self.config.get('list_style', 'dash')
+        self.image_handling = self.config.get('image_handling', 'skip')
+        self.table_alignment = self.config.get('table_alignment', 'left')
+
+    def process(self):
+        try:
+            file_data = minio_client.download_document_file(
+                self.tenant.id,
+                self.document_version.bucket_name,
+                self.document_version.object_name,
+            )
+
+            doc = docx.Document(io.BytesIO(file_data))
+            markdown = self._convert_to_markdown(doc)
+            title = self._extract_title(doc)
+
+            self._save_markdown(markdown)
+            return markdown, title
+
+        except Exception as e:
+            self._log(f"Error processing DOCX: {str(e)}", level='error')
+            raise
+
+    def _convert_to_markdown(self, doc):
+        markdown_parts = []
+
+        if self.extract_headers_footers:
+            for section in doc.sections:
+                if section.header.paragraphs:
+                    markdown_parts.extend(self._process_paragraphs(section.header.paragraphs))
+
+        markdown_parts.extend(self._process_paragraphs(doc.paragraphs))
+
+        if self.extract_comments and doc.comments:
+            markdown_parts.append("\n## Comments\n")
+            for comment in doc.comments:
+                markdown_parts.append(f"> {comment.text}\n")
+
+        return "\n".join(markdown_parts)
+
+    def _process_paragraphs(self, paragraphs):
+        markdown_parts = []
+        in_list = False
+
+        for para in paragraphs:
+            if not para.text.strip():
+                continue
+
+            style = para.style.name.lower()
+
+            if 'heading' in style:
+                level = int(style[-1]) if style[-1].isdigit() else 1
+                markdown_parts.append(f"{'#' * level} {para.text}\n")
+
+            elif para._p.pPr and para._p.pPr.numPr:  # List item
+                marker = self._get_list_marker()
+                markdown_parts.append(f"{marker} {para.text}\n")
+                in_list = True
+
+            else:
+                if in_list:
+                    markdown_parts.append("\n")
+                    in_list = False
+
+                text = para.text
+                if self.preserve_formatting:
+                    text = self._apply_formatting(para)
+
+                markdown_parts.append(f"{text}\n")
+
+        return markdown_parts
+
+    def _get_list_marker(self):
+        return {
+            'dash': '-',
+            'asterisk': '*',
+            'plus': '+'
+        }.get(self.list_style, '-')
+
+    def _apply_formatting(self, paragraph):
+        text = paragraph.text
+        if not text:
+            return ""
+
+        runs = paragraph.runs
+        formatted_parts = []
+
+        for run in runs:
+            part = run.text
+            if run.bold:
+                part = f"**{part}**"
+            if run.italic:
+                part = f"*{part}*"
+            if run.underline:
+                part = f"__{part}__"
+            formatted_parts.append(part)
+
+        return "".join(formatted_parts)
+
+    def _extract_title(self, doc):
+        if doc.paragraphs:
+            first_para = doc.paragraphs[0]
+            if 'heading' in first_para.style.name.lower():
+                return first_para.text.strip()
+
+        # Look for first Heading 1 in document
+        for para in doc.paragraphs:
+            if para.style.name.lower() == 'heading 1':
+                return para.text.strip()
+
+        return "Untitled Document"
+
+
+ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)
@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
         # Add verification logging
         self._log(f"HTML Processor initialized with tuning={self.tuning}")
         if self.tuning:
-            self._log_tuning("HTML Processor initialized", {
+            self.log_tuning("HTML Processor initialized", {
                 "html_tags": self.html_tags,
                 "html_end_tags": self.html_end_tags,
                 "included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
         title = soup.find('title').get_text(strip=True) if soup.find('title') else ''

         self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
-        self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
+        self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
         return extracted_html, title

     def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
             input_html = {"html": chunk}
             markdown_chunk = chain.invoke(input_html)
             markdown_chunks.append(markdown_chunk)
-            self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
+            self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})

         markdown = "\n\n".join(markdown_chunks)
         self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
eveai_workers/processors/markdown_processor.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+import re
+from langchain_core.runnables import RunnablePassthrough
+
+from common.extensions import minio_client
+from common.utils.model_utils import create_language_template
+from .base_processor import BaseProcessor
+from common.utils.business_event_context import current_event
+from .processor_registry import ProcessorRegistry
+
+
+def _find_first_h1(markdown: str) -> str:
+    # Look for # Header (allowing spaces after #)
+    match = re.search(r'^#\s+(.+)$', markdown, re.MULTILINE)
+    return match.group(1).strip() if match else ""
+
+
+class MarkdownProcessor(BaseProcessor):
+    def __init__(self, tenant, model_variables, document_version, catalog, processor):
+        super().__init__(tenant, model_variables, document_version, catalog, processor)
+
+        self.chunk_size = catalog.max_chunk_size
+        self.chunk_overlap = 0
+        self.tuning = self.processor.tuning
+
+    def process(self):
+        self._log("Starting Markdown processing")
+        try:
+            file_data = minio_client.download_document_file(
+                self.tenant.id,
+                self.document_version.bucket_name,
+                self.document_version.object_name,
+            )
+
+            markdown = file_data.decode('utf-8')
+            title = _find_first_h1(markdown)
+
+            self._save_markdown(markdown)
+            self._log("Finished processing Markdown")
+            return markdown, title
+        except Exception as e:
+            self._log(f"Error processing Markdown: {str(e)}", level='error')
+            raise
+
+
+ProcessorRegistry.register("MARKDOWN_PROCESSOR", MarkdownProcessor)
@@ -57,7 +57,7 @@ class PDFProcessor(BaseProcessor):
                 'figures': self._extract_figures(page, page_num, figure_counter),
                 'tables': self._extract_tables(page)
             }
-            self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
+            self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
             figure_counter += len(page_content['figures'])
             extracted_content.append(page_content)

@@ -119,7 +119,7 @@ class PDFProcessor(BaseProcessor):
                     markdown_table = self._table_to_markdown(table)
                     if markdown_table:  # Only add non-empty tables
                         tables.append(markdown_table)
-                        self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
+                        self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
         except Exception as e:
             self._log(f"Error extracting tables from page: {str(e)}", level='error')
         return tables
@@ -45,7 +45,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
         return text_splitter.split_text(transcription)

     def _process_chunks(self, chunks):
-        self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
+        self.log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
         llm = self.model_variables.get_llm()
         template = self.model_variables.get_template('transcript')
         language_template = create_language_template(template, self.document_version.language)
@@ -64,7 +64,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
             }
             markdown = chain.invoke(input_transcript)
             markdown = self._clean_markdown(markdown)
-            self._log_tuning("_process_chunks", {
+            self.log_tuning("_process_chunks", {
                 "Chunk Number": f"{i + 1} of {len(chunks)}",
                 "Chunk": chunk,
                 "Previous Chunk": previous_part,
@@ -1,3 +1,4 @@
+import re
 from datetime import datetime as dt, timezone as tz

 from celery import states
@@ -23,6 +24,8 @@ from common.utils.business_event_context import current_event
 from config.type_defs.processor_types import PROCESSOR_TYPES
 from eveai_workers.processors.processor_registry import ProcessorRegistry

+from common.utils.config_field_types import json_to_pattern_list
+

 # Healthcheck task
 @current_celery.task(name='ping', queue='embeddings')
@@ -99,9 +102,13 @@ def create_embeddings(tenant_id, document_version_id):
             processor=processor
         )
         markdown, title = document_processor.process()
+        document_processor.log_tuning("Processor returned: ", {
+            'markdown': markdown,
+            'title': title
+        })

         with current_event.create_span("Embedding"):
-            embed_markdown(tenant, model_variables, document_version, catalog, markdown, title)
+            embed_markdown(tenant, model_variables, document_version, catalog, document_processor, markdown, title)

         current_event.log("Finished Embedding Creation Task")

@@ -129,16 +136,19 @@ def delete_embeddings_for_document_version(document_version):
         raise


-def embed_markdown(tenant, model_variables, document_version, catalog, markdown, title):
+def embed_markdown(tenant, model_variables, document_version, catalog, processor, markdown, title):
     # Create potential chunks
-    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
+    potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, processor, markdown)
+    processor.log_tuning("Potential Chunks: ", {'potential chunks': potential_chunks})

     # Combine chunks for embedding
-    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size)
+    chunks = combine_chunks_for_markdown(potential_chunks, catalog.min_chunk_size, catalog.max_chunk_size, processor)
+    processor.log_tuning("Chunks: ", {'chunks': chunks})

     # Enrich chunks
     with current_event.create_span("Enrich Chunks"):
         enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
+        processor.log_tuning("Enriched Chunks: ", {'enriched_chunks': enriched_chunks})

     # Create embeddings
     with current_event.create_span("Create Embeddings"):
@@ -238,23 +248,17 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     return new_embeddings


-def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
+def create_potential_chunks_for_markdown(tenant_id, document_version, processor, markdown):
     try:
         current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
-        markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
-
-        # Download the markdown file from MinIO
-        markdown_data = minio_client.download_document_file(tenant_id,
-                                                            document_version.bucket_name,
-                                                            markdown_on,
-                                                            )
-        markdown = markdown_data.decode('utf-8')
+        heading_level = processor.configuration.get('chunking_heading_level', 2)

         headers_to_split_on = [
-            ("#", "Header 1"),
-            ("##", "Header 2"),
+            (f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
         ]

+        processor.log_tuning('Headers to split on', {'header list: ': headers_to_split_on})
+
         markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
         md_header_splits = markdown_splitter.split_text(markdown)
         potential_chunks = [doc.page_content for doc in md_header_splits]
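For a concrete sense of the header list built above (illustrative, not part of the diff): with chunking_heading_level set to 3 the comprehension produces:

    heading_level = 3
    headers_to_split_on = [
        (f"{'#' * i}", f"Header {i}") for i in range(1, min(heading_level + 1, 7))
    ]
    # [('#', 'Header 1'), ('##', 'Header 2'), ('###', 'Header 3')]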
@@ -265,14 +269,61 @@ def create_potential_chunks_for_markdown(tenant_id, document_version, input_file
         raise


-def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
+def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars, processor):
     actual_chunks = []
     current_chunk = ""
     current_length = 0

+    def matches_chunking_pattern(text, patterns):
+        if not patterns:
+            return False
+
+        # Get the first line of the text
+        first_line = text.split('\n', 1)[0].strip()
+
+        # Check if it's a header at appropriate level
+        header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
+        if not header_match:
+            return False
+
+        # Get the heading level (number of #s)
+        header_level = len(header_match.group(1))
+        # Get the header text
+        header_text = header_match.group(2)
+
+        # Check if header matches any pattern
+        for pattern in patterns:
+            try:
+                processor.log_tuning('Pattern check: ', {
+                    'pattern: ': pattern,
+                    'text': header_text
+                })
+                if re.search(pattern, header_text, re.IGNORECASE):
+                    return True
+            except Exception as e:
+                current_app.logger.warning(f"Invalid regex pattern '{pattern}': {str(e)}")
+                continue
+
+        return False
+
+    chunking_patterns = json_to_pattern_list(processor.configuration.get('chunking_patterns', []))
+
+    processor.log_tuning(f'Chunking Patterns Extraction: ', {
+        'Full Configuration': processor.configuration,
+        'Chunking Patterns': chunking_patterns,
+    })
+
     for chunk in potential_chunks:
         chunk_length = len(chunk)

+        # Force new chunk if pattern matches
+        if chunking_patterns and matches_chunking_pattern(chunk, chunking_patterns):
+            if current_chunk and current_length >= min_chars:
+                actual_chunks.append(current_chunk)
+            current_chunk = chunk
+            current_length = chunk_length
+            continue
+
         if current_length + chunk_length > max_chars:
             if current_length >= min_chars:
                 actual_chunks.append(current_chunk)
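And, as a standalone illustration (not part of the diff) of the check performed by matches_chunking_pattern: a potential chunk is forced to start a new chunk when its first line is a Markdown heading whose text matches one of the configured regexes, case-insensitively:

    import re

    patterns = ['^Chapter [0-9]+', '^Appendix [A-Z]']
    chunk = "## chapter 3: Results\nSome body text..."

    first_line = chunk.split('\n', 1)[0].strip()
    header_match = re.match(r'^(#{1,6})\s+(.+)$', first_line)
    header_text = header_match.group(2)          # 'chapter 3: Results'
    starts_new_chunk = any(
        re.search(p, header_text, re.IGNORECASE) for p in patterns
    )
    print(starts_new_chunk)                      # True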
@@ -89,3 +89,4 @@ prometheus_flask_exporter~=0.23.1
 prometheus_client~=0.20.0
 babel~=2.16.0
 dogpile.cache~=1.3.3
+python-docx~=1.1.2