- License Usage Calculation realised
- View License Usages - Celery Beat container added - First schedule in Celery Beat for calculating usage (hourly) - repopack can now split for different components - Various fixes as consequece of changing file_location / file_name ==> bucket_name / object_name - Celery Routing / Queuing updated
This commit is contained in:
@@ -27,10 +27,8 @@ class AudioProcessor(TranscriptionProcessor):
|
||||
def _get_transcription(self):
|
||||
file_data = minio_client.download_document_file(
|
||||
self.tenant.id,
|
||||
self.document_version.doc_id,
|
||||
self.document_version.language,
|
||||
self.document_version.id,
|
||||
self.document_version.file_name
|
||||
self.document_version.bucket_name,
|
||||
self.document_version.object_name,
|
||||
)
|
||||
|
||||
with current_event.create_span("Audio Compression"):
|
||||
|
||||
@@ -24,10 +24,8 @@ class HTMLProcessor(Processor):
|
||||
try:
|
||||
file_data = minio_client.download_document_file(
|
||||
self.tenant.id,
|
||||
self.document_version.doc_id,
|
||||
self.document_version.language,
|
||||
self.document_version.id,
|
||||
self.document_version.file_name
|
||||
self.document_version.bucket_name,
|
||||
self.document_version.object_name,
|
||||
)
|
||||
html_content = file_data.decode('utf-8')
|
||||
|
||||
|
||||
@@ -27,10 +27,8 @@ class PDFProcessor(Processor):
|
||||
try:
|
||||
file_data = minio_client.download_document_file(
|
||||
self.tenant.id,
|
||||
self.document_version.doc_id,
|
||||
self.document_version.language,
|
||||
self.document_version.id,
|
||||
self.document_version.file_name
|
||||
self.document_version.bucket_name,
|
||||
self.document_version.object_name,
|
||||
)
|
||||
|
||||
with current_event.create_span("PDF Extraction"):
|
||||
|
||||
@@ -7,10 +7,8 @@ class SRTProcessor(TranscriptionProcessor):
|
||||
def _get_transcription(self):
|
||||
file_data = minio_client.download_document_file(
|
||||
self.tenant.id,
|
||||
self.document_version.doc_id,
|
||||
self.document_version.language,
|
||||
self.document_version.id,
|
||||
self.document_version.file_name
|
||||
self.document_version.bucket_name,
|
||||
self.document_version.object_name,
|
||||
)
|
||||
srt_content = file_data.decode('utf-8')
|
||||
return self._clean_srt(srt_content)
|
||||
|
||||
@@ -44,3 +44,4 @@ def register_extensions(app):
|
||||
|
||||
|
||||
app, celery = create_app()
|
||||
|
||||
|
||||
@@ -36,34 +36,36 @@ def ping():
|
||||
|
||||
@current_celery.task(name='create_embeddings', queue='embeddings')
|
||||
def create_embeddings(tenant_id, document_version_id):
|
||||
# Retrieve document version to process
|
||||
document_version = DocumentVersion.query.get(document_version_id)
|
||||
if document_version is None:
|
||||
raise Exception(f'Document version {document_version_id} not found')
|
||||
try:
|
||||
# Retrieve Tenant for which we are processing
|
||||
tenant = Tenant.query.get(tenant_id)
|
||||
if tenant is None:
|
||||
raise Exception(f'Tenant {tenant_id} not found')
|
||||
|
||||
# Ensure we are working in the correct database schema
|
||||
Database(tenant_id).switch_schema()
|
||||
|
||||
# Retrieve document version to process
|
||||
document_version = DocumentVersion.query.get(document_version_id)
|
||||
if document_version is None:
|
||||
raise Exception(f'Document version {document_version_id} not found')
|
||||
|
||||
# Select variables to work with depending on tenant and model
|
||||
model_variables = select_model_variables(tenant)
|
||||
current_app.logger.debug(f'Model variables: {model_variables}')
|
||||
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Create Embeddings request received '
|
||||
f'for non existing document version {document_version_id} '
|
||||
f'for tenant {tenant_id}, '
|
||||
f'error: {e}')
|
||||
raise
|
||||
|
||||
# BusinessEvent creates a context, which is why we need to use it with a with block
|
||||
with BusinessEvent('Create Embeddings', tenant_id,
|
||||
document_version_id=document_version_id,
|
||||
document_version_file_size=document_version.file_size):
|
||||
current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}')
|
||||
try:
|
||||
# Retrieve Tenant for which we are processing
|
||||
tenant = Tenant.query.get(tenant_id)
|
||||
if tenant is None:
|
||||
raise Exception(f'Tenant {tenant_id} not found')
|
||||
|
||||
# Ensure we are working in the correct database schema
|
||||
Database(tenant_id).switch_schema()
|
||||
|
||||
# Select variables to work with depending on tenant and model
|
||||
model_variables = select_model_variables(tenant)
|
||||
current_app.logger.debug(f'Model variables: {model_variables}')
|
||||
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Create Embeddings request received '
|
||||
f'for non existing document version {document_version_id} '
|
||||
f'for tenant {tenant_id}, '
|
||||
f'error: {e}')
|
||||
raise
|
||||
|
||||
try:
|
||||
db.session.add(document_version)
|
||||
@@ -204,7 +206,7 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
|
||||
if len(chunks) > 1:
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
|
||||
chunk_total_context = (f'Filename: {document_version.file_name}\n'
|
||||
chunk_total_context = (f'Filename: {document_version.object_name}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'User Metadata:\n{document_version.user_metadata}\n\n'
|
||||
f'Title: {title}\n'
|
||||
@@ -213,7 +215,7 @@ def enrich_chunks(tenant, model_variables, document_version, title, chunks):
|
||||
f'System Metadata:\n{document_version.system_metadata}\n\n'
|
||||
)
|
||||
enriched_chunks = []
|
||||
initial_chunk = (f'Filename: {document_version.file_name}\n'
|
||||
initial_chunk = (f'Filename: {document_version.object_name}\n'
|
||||
f'User Context:\n{document_version.user_context}\n\n'
|
||||
f'User Metadata:\n{document_version.user_metadata}\n\n'
|
||||
f'Title: {title}\n'
|
||||
@@ -304,13 +306,12 @@ def log_parsing_info(tenant, tags, included_elements, excluded_elements, exclude
|
||||
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
|
||||
try:
|
||||
current_app.logger.info(f'Creating potential chunks for tenant {tenant_id}')
|
||||
markdown_on = document_version.object_name.rsplit('.', 1)[0] + '.md'
|
||||
|
||||
# Download the markdown file from MinIO
|
||||
markdown_data = minio_client.download_document_file(tenant_id,
|
||||
document_version.doc_id,
|
||||
document_version.language,
|
||||
document_version.id,
|
||||
input_file
|
||||
document_version.bucket_name,
|
||||
markdown_on,
|
||||
)
|
||||
markdown = markdown_data.decode('utf-8')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user