- License Usage Calculation realised

- View License Usages - Celery Beat container added - First schedule in Celery Beat for calculating usage (hourly) - repopack can now split for different components - Various fixes as a consequence of changing file_location / file_name ==> bucket_name / object_name - Celery Routing / Queuing updated
2024-10-11 16:33:36 +02:00
parent 5ffad160b1
commit 9f5f090f0c
57 changed files with 935 additions and 174 deletions
--- a/eveai_workers/Processors/audio_processor.py
+++ b/eveai_workers/Processors/audio_processor.py
@@ -27,10 +27,8 @@ class AudioProcessor(TranscriptionProcessor):
    def _get_transcription(self):
        file_data = minio_client.download_document_file(
            self.tenant.id,
-            self.document_version.doc_id,
-            self.document_version.language,
-            self.document_version.id,
-            self.document_version.file_name
+            self.document_version.bucket_name,
+            self.document_version.object_name,
        )

        with current_event.create_span("Audio Compression"):
--- a/eveai_workers/Processors/html_processor.py
+++ b/eveai_workers/Processors/html_processor.py
@@ -24,10 +24,8 @@ class HTMLProcessor(Processor):
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
-                self.document_version.doc_id,
-                self.document_version.language,
-                self.document_version.id,
-                self.document_version.file_name
+                self.document_version.bucket_name,
+                self.document_version.object_name,
            )
            html_content = file_data.decode('utf-8')

--- a/eveai_workers/Processors/pdf_processor.py
+++ b/eveai_workers/Processors/pdf_processor.py
@@ -27,10 +27,8 @@ class PDFProcessor(Processor):
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
-                self.document_version.doc_id,
-                self.document_version.language,
-                self.document_version.id,
-                self.document_version.file_name
+                self.document_version.bucket_name,
+                self.document_version.object_name,
            )

            with current_event.create_span("PDF Extraction"):
--- a/eveai_workers/Processors/srt_processor.py
+++ b/eveai_workers/Processors/srt_processor.py
@@ -7,10 +7,8 @@ class SRTProcessor(TranscriptionProcessor):
    def _get_transcription(self):
        file_data = minio_client.download_document_file(
            self.tenant.id,
-            self.document_version.doc_id,
-            self.document_version.language,
-            self.document_version.id,
-            self.document_version.file_name
+            self.document_version.bucket_name,
+            self.document_version.object_name,
        )
        srt_content = file_data.decode('utf-8')
        return self._clean_srt(srt_content)
--- a/eveai_workers/init.py
+++ b/eveai_workers/init.py
@@ -44,3 +44,4 @@ def register_extensions(app):


 app, celery = create_app()
+
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -36,34 +36,36 @@ def ping():

@current_celery.task(name='create_embeddings', queue='embeddings')
 def create_embeddings(tenant_id, document_version_id):
-    # Retrieve document version to process
-    document_version = DocumentVersion.query.get(document_version_id)
-    if document_version is None:
-        raise Exception(f'Document version {document_version_id} not found')
+    try:
+        # Retrieve Tenant for which we are processing
+        tenant = Tenant.query.get(tenant_id)
+        if tenant is None:
+            raise Exception(f'Tenant {tenant_id} not found')
+
+        # Ensure we are working in the correct database schema
+        Database(tenant_id).switch_schema()
+
+        # Retrieve document version to process
+        document_version = DocumentVersion.query.get(document_version_id)
+        if document_version is None:
+            raise Exception(f'Document version {document_version_id} not found')
+
+        # Select variables to work with depending on tenant and model
+        model_variables = select_model_variables(tenant)
+        current_app.logger.debug(f'Model variables: {model_variables}')
+
+    except Exception as e:
+        current_app.logger.error(f'Create Embeddings request received '
+                                 f'for non existing document version {document_version_id} '
+                                 f'for tenant {tenant_id}, '
+                                 f'error: {e}')
+        raise
+
    # BusinessEvent creates a context, which is why we need to use it with a with block
    with BusinessEvent('Create Embeddings', tenant_id,
                       document_version_id=document_version_id,
                       document_version_file_size=document_version.file_size):
        current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}')
-        try:
-            # Retrieve Tenant for which we are processing
-            tenant = Tenant.query.get(tenant_id)
-            if tenant is None:
-                raise Exception(f'Tenant {tenant_id} not found')
-
-            # Ensure we are working in the correct database schema
-            Database(tenant_id).switch_schema()
-
-            # Select variables to work with depending on tenant and model
-            model_variables = select_model_variables(tenant)
-            current_app.logger.debug(f'Model variables: {model_variables}')
-
-        except Exception as e:
-            current_app.logger.error(f'Create Embeddings request received '
-                                     f'for non existing document version {document_version_id} '
-                                     f'for tenant {tenant_id}, '
-                                     f'error: {e}')
-            raise

        try:
            db.session.add(document_version)
@@ -204,7 +206,7 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
    if len(chunks) > 1:
        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])

-    chunk_total_context = (f'Filename: {document_version.file_name}\n'
+    chunk_total_context = (f'Filename: {document_version.object_name}\n'
                           f'User Context:\n{document_version.user_context}\n\n'
                           f'User Metadata:\n{document_version.user_metadata}\n\n'
                           f'Title: {title}\n'
@@ -213,7 +215,7 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
                           f'System Metadata:\n{document_version.system_metadata}\n\n'
                           )
    enriched_chunks = []
-    initial_chunk = (f'Filename: {document_version.file_name}\n'
+    initial_chunk = (f'Filename: {document_version.object_name}\n'
                     f'User Context:\n{document_version.user_context}\n\n'
                     f'User Metadata:\n{document_version.user_metadata}\n\n'
                     f'Title: {title}\n'
@@ -304,13 +306,12 @@ def log_parsing_info(tenant, tags, included_elements, excluded_elements, exclude
 def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
    try:
        current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
+        markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'

        # Download the markdown file from MinIO
        markdown_data = minio_client.download_document_file(tenant_id,
-                                                            document_version.doc_id,
-                                                            document_version.language,
-                                                            document_version.id,
-                                                            input_file
+                                                            document_version.bucket_name,
+                                                            markdown_on,
                                                            )
        markdown = markdown_data.decode('utf-8')
				`@@ -44,3 +44,4 @@ def register_extensions(app):`


				`app, celery = create_app()`