- Addition of general chunking parameters chunking_heading_level and chunking_patterns (sketched below)
- Addition of Processor types docx and markdown
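As an illustration of the first point, the new chunking parameters would presumably travel with a document version's catalog_properties. The key names' semantics and the values below are assumptions for the sketch only; the processor changes themselves are not part of the diff shown here.

# Hypothetical illustration only: how the new parameters might be supplied
# alongside a document version (values are assumed, not taken from this diff).
catalog_properties = {
    'chunking_heading_level': 2,           # e.g. split docx/markdown on headings up to level 2
    'chunking_patterns': ['^# ', '^## '],  # e.g. patterns that mark the start of a new chunk
}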
@@ -12,7 +12,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse
import os
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
-                              EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion)
+                              EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
from ..models.user import Tenant

@@ -219,12 +219,6 @@ def start_embedding_task(tenant_id, doc_vers_id):
    return task.id


-def validate_file_type(extension):
-    if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
-        raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
-                                       f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
-
-
def get_filename_from_url(url):
    parsed_url = urlparse(url)
    path_parts = parsed_url.path.split('/')
@@ -363,3 +357,109 @@ def cope_with_local_url(url):
    return url


def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -> tuple[Document, DocumentVersion]:
    """
    Look up a document using metadata criteria

    Args:
        tenant_id: ID of the tenant
        lookup_criteria: Dictionary of key-value pairs to match in metadata
        metadata_type: Which metadata to search in ('user_metadata' or 'system_metadata')

    Returns:
        Tuple of (Document, DocumentVersion) if found

    Raises:
        ValueError: If an invalid metadata_type is provided
        EveAIException: If no document matches or the lookup fails
    """
    if metadata_type not in ['user_metadata', 'system_metadata']:
        raise ValueError(f"Invalid metadata_type: {metadata_type}")

    try:
        # Query for the latest document version matching the criteria
        query = (db.session.query(Document, DocumentVersion)
                 .join(DocumentVersion)
                 .filter(Document.id == DocumentVersion.doc_id)
                 .order_by(DocumentVersion.id.desc()))

        # Add metadata filtering using PostgreSQL JSONB operators
        # (metadata_field[key].astext compiles to `metadata ->> 'key'` in SQL)
        metadata_field = getattr(DocumentVersion, metadata_type)
        for key, value in lookup_criteria.items():
            query = query.filter(metadata_field[key].astext == str(value))

        # Get first result
        result = query.first()

        if not result:
            raise EveAIException(
                f"No document found matching criteria in {metadata_type}",
                status_code=404
            )

        return result

    except EveAIException:
        # Re-raise unchanged so the 404 above is not swallowed by the generic handler below
        raise
    except SQLAlchemyError as e:
        current_app.logger.error(f'Database error during document lookup for tenant {tenant_id}: {e}')
        raise EveAIException(
            "Database error during document lookup",
            status_code=500
        )
    except Exception as e:
        current_app.logger.error(f'Error during document lookup for tenant {tenant_id}: {e}')
        raise EveAIException(
            "Error during document lookup",
            status_code=500
        )
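
A minimal usage sketch of the new lookup helper. The tenant id and metadata criteria are hypothetical, and an active Flask application context with the imports above is assumed:

# Hypothetical call: find the newest version whose user_metadata carries these keys.
doc, doc_vers = lookup_document(
    tenant_id=1,
    lookup_criteria={'source_system': 'sharepoint', 'external_id': '42'},
    metadata_type='user_metadata',
)
current_app.logger.info(f'Found document {doc.id}, latest version {doc_vers.id}')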


# Add to common/utils/document_utils.py

def refresh_document_with_content(doc_id: int, tenant_id: int, file_content: bytes, api_input: dict) -> tuple:
    """
    Refresh document with new content

    Args:
        doc_id: Document ID
        tenant_id: Tenant ID
        file_content: New file content
        api_input: Additional document information

    Returns:
        Tuple of (new_version, task_id), or (None, error message) if the new version
        cannot be committed to the database
    """
    doc = Document.query.get(doc_id)
    if not doc:
        raise EveAIInvalidDocument(tenant_id, doc_id)

    old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()

    # Create new version with same file type as original
    extension = old_doc_vers.file_type

    new_doc_vers = create_version_for_document(
        doc, tenant_id,
        '',  # No URL for content-based updates
        old_doc_vers.sub_file_type,
        api_input.get('language', old_doc_vers.language),
        api_input.get('user_context', old_doc_vers.user_context),
        api_input.get('user_metadata', old_doc_vers.user_metadata),
        api_input.get('catalog_properties', old_doc_vers.catalog_properties),
    )

    try:
        db.session.add(new_doc_vers)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        return None, str(e)

    # Upload new content
    upload_file_for_version(new_doc_vers, file_content, extension, tenant_id)

    # Start embedding task
    task = current_celery.send_task('create_embeddings', args=[tenant_id, new_doc_vers.id], queue='embeddings')
    current_app.logger.info(f'Embedding creation started for document {doc_id} on version {new_doc_vers.id} '
                            f'with task id: {task.id}.')

    return new_doc_vers, task.id
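
A sketch of how a caller might use the refresh helper. The file path, ids, and api_input keys are illustrative assumptions, not part of this commit, and an active Flask application context is assumed:

from pathlib import Path

# Hypothetical inputs: a re-uploaded file on disk plus illustrative ids/metadata.
file_bytes = Path('/tmp/report_v2.docx').read_bytes()
new_version, task_id = refresh_document_with_content(
    doc_id=123,
    tenant_id=1,
    file_content=file_bytes,
    api_input={'language': 'en', 'user_metadata': {'revision': '2'}},
)
if new_version is None:
    # On a failed database commit, the second element holds the error message
    current_app.logger.error(f'Document refresh failed: {task_id}')
else:
    current_app.logger.info(f'New version {new_version.id} queued as task {task_id}')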