- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -12,7 +12,7 @@ import requests
from urllib.parse import urlparse, unquote, urlunparse
import os
from .eveai_exceptions import (EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion)
EveAIInvalidCatalog, EveAIInvalidDocument, EveAIInvalidDocumentVersion, EveAIException)
from ..models.user import Tenant
@@ -219,12 +219,6 @@ def start_embedding_task(tenant_id, doc_vers_id):
return task.id
def validate_file_type(extension):
    """Validate that *extension* is one of the app's supported file types.

    Raises:
        EveAIUnsupportedFileType: if the extension is not listed in the
            application's SUPPORTED_FILE_TYPES configuration.
    """
    supported = current_app.config['SUPPORTED_FILE_TYPES']
    if extension in supported:
        return
    raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
                                   f"Supported filetypes: {', '.join(supported)}")
def get_filename_from_url(url):
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
@@ -363,3 +357,109 @@ def cope_with_local_url(url):
return url
def lookup_document(tenant_id: int, lookup_criteria: dict, metadata_type: str) -> tuple[Document, DocumentVersion]:
    """
    Look up a document using metadata criteria.

    Args:
        tenant_id: ID of the tenant (used for logging only in this function)
        lookup_criteria: Dictionary of key-value pairs to match in metadata
        metadata_type: Which metadata to search in ('user_metadata' or 'system_metadata')

    Returns:
        Tuple of (Document, DocumentVersion) for the latest matching version

    Raises:
        ValueError: If invalid metadata_type provided
        EveAIException: 404 if no document matches, 500 on database or
            unexpected errors
    """
    if metadata_type not in ['user_metadata', 'system_metadata']:
        raise ValueError(f"Invalid metadata_type: {metadata_type}")

    try:
        # Query for the latest document version matching the criteria
        query = (db.session.query(Document, DocumentVersion)
                 .join(DocumentVersion)
                 .filter(Document.id == DocumentVersion.doc_id)
                 .order_by(DocumentVersion.id.desc()))

        # Add metadata filtering using PostgreSQL JSONB operators
        metadata_field = getattr(DocumentVersion, metadata_type)
        for key, value in lookup_criteria.items():
            query = query.filter(metadata_field[key].astext == str(value))

        # Get first result (latest version thanks to the descending order)
        result = query.first()

        if not result:
            raise EveAIException(
                f"No document found matching criteria in {metadata_type}",
                status_code=404
            )

        return result

    except EveAIException:
        # Bug fix: the 404 raised above must not fall through to the generic
        # Exception handler below, which re-wrapped it as a 500.
        raise
    except SQLAlchemyError as e:
        current_app.logger.error(f'Database error during document lookup for tenant {tenant_id}: {e}')
        raise EveAIException(
            "Database error during document lookup",
            status_code=500
        ) from e
    except Exception as e:
        current_app.logger.error(f'Error during document lookup for tenant {tenant_id}: {e}')
        raise EveAIException(
            "Error during document lookup",
            status_code=500
        ) from e
# Add to common/utils/document_utils.py
def refresh_document_with_content(doc_id: int, tenant_id: int, file_content: bytes, api_input: dict) -> tuple:
    """
    Refresh a document with new content and queue re-embedding.

    Creates a new DocumentVersion inheriting attributes from the latest
    existing version (unless overridden via *api_input*), uploads the new
    content, and starts the 'create_embeddings' Celery task.

    Args:
        doc_id: Document ID
        tenant_id: Tenant ID
        file_content: New file content
        api_input: Optional overrides: 'language', 'user_context',
            'user_metadata', 'catalog_properties'

    Returns:
        Tuple of (new_version, task_id). On a database error during commit
        the tuple is (None, error_message) — preserved for backward
        compatibility with existing callers.

    Raises:
        EveAIInvalidDocument: If no document with *doc_id* exists.
        EveAIInvalidDocumentVersion: If the document has no prior version
            to inherit attributes from.
    """
    doc = Document.query.get(doc_id)
    if not doc:
        raise EveAIInvalidDocument(tenant_id, doc_id)

    old_doc_vers = (DocumentVersion.query
                    .filter_by(doc_id=doc_id)
                    .order_by(desc(DocumentVersion.id))
                    .first())
    # Bug fix: previously crashed with AttributeError when the document had
    # no versions yet; fail with the domain exception instead.
    if old_doc_vers is None:
        raise EveAIInvalidDocumentVersion(tenant_id, doc_id)

    # Create new version with same file type as original
    extension = old_doc_vers.file_type
    new_doc_vers = create_version_for_document(
        doc, tenant_id,
        '',  # No URL for content-based updates
        old_doc_vers.sub_file_type,
        api_input.get('language', old_doc_vers.language),
        api_input.get('user_context', old_doc_vers.user_context),
        api_input.get('user_metadata', old_doc_vers.user_metadata),
        api_input.get('catalog_properties', old_doc_vers.catalog_properties),
    )

    try:
        db.session.add(new_doc_vers)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        # Log before returning the legacy (None, message) error tuple so the
        # failure is visible server-side, not only to the caller.
        current_app.logger.error(f'Database error refreshing document {doc_id} for tenant {tenant_id}: {e}')
        return None, str(e)

    # Upload new content
    upload_file_for_version(new_doc_vers, file_content, extension, tenant_id)

    # Start embedding task
    task = current_celery.send_task('create_embeddings', args=[tenant_id, new_doc_vers.id], queue='embeddings')
    current_app.logger.info(f'Embedding creation started for document {doc_id} on version {new_doc_vers.id} '
                            f'with task id: {task.id}.')
    return new_doc_vers, task.id