- View License Usages - Celery Beat container added - First schedule in Celery Beat for calculating usage (hourly) - repopack can now split for different components - Various fixes as consequence of changing file_location / file_name ==> bucket_name / object_name - Celery Routing / Queuing updated
350 lines
12 KiB
Python
350 lines
12 KiB
Python
from datetime import datetime as dt, timezone as tz
|
|
|
|
from sqlalchemy import desc
|
|
from sqlalchemy.exc import SQLAlchemyError
|
|
from werkzeug.utils import secure_filename
|
|
from common.models.document import Document, DocumentVersion
|
|
from common.extensions import db, minio_client
|
|
from common.utils.celery_utils import current_celery
|
|
from flask import current_app
|
|
from flask_security import current_user
|
|
import requests
|
|
from urllib.parse import urlparse, unquote
|
|
import os
|
|
from .eveai_exceptions import EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType
|
|
from ..models.user import Tenant
|
|
|
|
|
|
def create_document_stack(api_input, file, filename, extension, tenant_id):
    """Create a Document plus its first DocumentVersion and upload the file.

    Args:
        api_input: dict-like payload with optional 'name', 'valid_from', 'url',
            'language', 'user_context' and 'user_metadata' keys.
        file: file content/stream to hand to MinIO.
        filename: original filename (fallback source for the document name).
        extension: file extension, stored as the version's file type.
        tenant_id: id of the tenant owning the document.

    Returns:
        Tuple (Document, DocumentVersion), both committed.

    Raises:
        SQLAlchemyError: when the initial commit fails (session rolled back).
    """
    # Create the Document
    new_doc = create_document(api_input, filename, tenant_id)
    db.session.add(new_doc)

    # Create the DocumentVersion
    new_doc_vers = create_version_for_document(new_doc,
                                               api_input.get('url', ''),
                                               api_input.get('language', 'en'),
                                               api_input.get('user_context', ''),
                                               api_input.get('user_metadata'),
                                               )
    db.session.add(new_doc_vers)

    try:
        db.session.commit()
    except SQLAlchemyError as e:
        current_app.logger.error(f'Error adding document for tenant {tenant_id}: {e}')
        db.session.rollback()
        raise

    # Fixed: previously logged new_doc.id while labeling it "Document Version".
    current_app.logger.info(f'Document {new_doc.id} added successfully for tenant {tenant_id}, '
                            f'Document Version {new_doc_vers.id}')

    # Upload file to storage
    upload_file_for_version(new_doc_vers, file, extension, tenant_id)

    return new_doc, new_doc_vers
|
|
|
|
|
|
def create_document(form, filename, tenant_id):
    """Build a new (uncommitted) Document from form/API input.

    Falls back to the filename (without extension) when no name is given,
    and to the current UTC time when no valid_from is given.

    Args:
        form: dict-like input with optional 'name' and 'valid_from' keys.
        filename: original filename used as the name fallback.
        tenant_id: id of the owning tenant.

    Returns:
        The populated, not-yet-persisted Document.
    """
    new_doc = Document()
    # .get() avoids a KeyError for API callers that omit the field entirely;
    # behavior for present-but-empty values is unchanged.
    if form.get('name', '') == '':
        new_doc.name = filename.rsplit('.', 1)[0]
    else:
        new_doc.name = form['name']

    valid_from = form.get('valid_from')
    if valid_from:  # covers missing key, None and '' in one truthiness test
        new_doc.valid_from = valid_from
    else:
        new_doc.valid_from = dt.now(tz.utc)
    new_doc.tenant_id = tenant_id
    set_logging_information(new_doc, dt.now(tz.utc))

    return new_doc
|
|
|
|
|
|
def create_version_for_document(document, url, language, user_context, user_metadata):
    """Build a new (uncommitted) DocumentVersion attached to *document*.

    Empty-string url/user_context and empty/None user_metadata are treated
    as "not provided" and left unset on the version.

    Raises:
        EveAIInvalidLanguageException: when *language* is empty.
    """
    version = DocumentVersion()

    if language == '':
        raise EveAIInvalidLanguageException('Language is required for document creation!')
    version.language = language

    if url != '':
        version.url = url
    if user_context != '':
        version.user_context = user_context
    if user_metadata is not None and user_metadata != '':
        version.user_metadata = user_metadata

    version.document = document
    set_logging_information(version, dt.now(tz.utc))

    # A new version changes the tenant's storage footprint; flag it for the
    # periodic usage recalculation.
    mark_tenant_storage_dirty(document.tenant_id)

    return version
|
|
|
|
|
|
def upload_file_for_version(doc_vers, file, extension, tenant_id):
    """Upload *file* to MinIO and persist the storage metadata on *doc_vers*.

    Sets file_type, bucket_name, object_name and file_size (in MB) on the
    version and commits the session. On any upload or commit failure the
    session is rolled back and the exception re-raised.

    NOTE(review): this commits the whole current session, so any other
    pending changes on the session are committed/rolled back together.
    """
    doc_vers.file_type = extension

    # Normally, the tenant bucket should exist. But let's be on the safe side if a migration took place.
    minio_client.create_tenant_bucket(tenant_id)

    try:
        # Returns the bucket name, object name and size in bytes of the stored file.
        bn, on, size = minio_client.upload_document_file(
            tenant_id,
            doc_vers.doc_id,
            doc_vers.language,
            doc_vers.id,
            f"{doc_vers.id}.{extension}",
            file
        )
        doc_vers.bucket_name = bn
        doc_vers.object_name = on
        doc_vers.file_size = size / 1048576  # Convert bytes to MB

        db.session.commit()
        current_app.logger.info(f'Successfully saved document to MinIO for tenant {tenant_id} for '
                                f'document version {doc_vers.id} while uploading file.')
    except Exception as e:
        # Broad catch on purpose: MinIO client errors and SQLAlchemy errors
        # both undo the DB changes, then propagate to the caller.
        db.session.rollback()
        current_app.logger.error(
            f'Error saving document to MinIO for tenant {tenant_id}: {e}')
        raise
|
|
|
|
|
|
def set_logging_information(obj, timestamp):
    """Stamp creation/update audit fields on a freshly created model object."""
    obj.created_at = obj.updated_at = timestamp

    actor = get_current_user_id()
    if actor:
        # Only recorded when an authenticated user is available (web context).
        obj.created_by = actor
        obj.updated_by = actor
|
|
|
|
|
|
def update_logging_information(obj, timestamp):
    """Stamp update audit fields on an existing model object."""
    obj.updated_at = timestamp

    actor = get_current_user_id()
    if actor:
        # Only recorded when an authenticated user is available (web context).
        obj.updated_by = actor
|
|
|
|
|
|
def get_current_user_id():
    """Return the authenticated user's id, or None outside a user context.

    Any error from touching current_user (e.g. no request/security context
    in API or Celery worker processes) is deliberately swallowed and
    treated as "no user".
    """
    try:
        if not (current_user and current_user.is_authenticated):
            return None
        return current_user.id
    except Exception:
        # current_user may be unavailable outside a Flask-Security context.
        return None
|
|
|
|
|
|
def get_extension_from_content_type(content_type):
    """Map a MIME type to a file extension, defaulting to 'html'.

    The lookup is case-insensitive and tolerant of surrounding whitespace
    and a None value, since HTTP header values are case-insensitive by spec
    and may be absent.

    Args:
        content_type: MIME type string (already stripped of parameters), or None.

    Returns:
        The mapped extension, or 'html' when the type is unknown/missing.
    """
    content_type_map = {
        'text/html': 'html',
        'application/pdf': 'pdf',
        'text/plain': 'txt',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        # Add more mappings as needed
    }
    normalized = (content_type or '').strip().lower()
    return content_type_map.get(normalized, 'html')  # Default to 'html' if unknown
|
|
|
|
|
|
def process_url(url, tenant_id):
    """Download the resource at *url* and derive a safe filename and extension.

    Args:
        url: resource to fetch.
        tenant_id: owning tenant (currently unused; kept for interface parity).

    Returns:
        Tuple (file_content, filename, extension).

    Raises:
        EveAIDoubleURLException: when a DocumentVersion already exists for url.
        requests.HTTPError: when the download fails.
    """
    # Reject duplicates before doing any network work (previously checked
    # only after the HEAD request).
    existing_doc = DocumentVersion.query.filter_by(url=url).first()
    if existing_doc:
        raise EveAIDoubleURLException

    # timeout added: a hung remote server must not block the worker forever.
    response = requests.head(url, allow_redirects=True, timeout=30)
    content_type = response.headers.get('Content-Type', '').split(';')[0]

    # Determine file extension based on Content-Type
    extension = get_extension_from_content_type(content_type)

    # Generate filename from the URL path
    parsed_url = urlparse(url)
    path = unquote(parsed_url.path)
    filename = os.path.basename(path)

    if not filename or '.' not in filename:
        # Use the last part of the path (or a default) and append the extension.
        # Fixed: the derived name was computed and then immediately discarded
        # in favor of a hard-coded placeholder.
        filename = path.strip('/').split('/')[-1] or 'document'
        filename = secure_filename(f"{filename}.{extension}")
    else:
        filename = secure_filename(filename)

    # Download the content
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    file_content = response.content

    return file_content, filename, extension
|
|
|
|
|
|
def process_multiple_urls(urls, tenant_id, api_input):
    """Create a document stack for each URL and kick off an embedding task.

    Failures are isolated per URL: a failing URL yields a result entry with
    status 'error' instead of aborting the whole batch.

    Args:
        urls: iterable of URLs to ingest.
        tenant_id: owning tenant id.
        api_input: shared document settings; an optional 'name' is used as a
            prefix for each per-URL document name.

    Returns:
        List of per-URL result dicts ('status' is 'success' or 'error').
    """
    results = []
    for url in urls:
        try:
            file_content, filename, extension = process_url(url, tenant_id)

            url_input = api_input.copy()
            # .get() avoids a KeyError when no shared name was supplied.
            base_name = api_input.get('name')
            url_input.update({
                'url': url,
                # Fixed: suffix the per-URL name with the derived filename
                # instead of a hard-coded placeholder.
                'name': f"{base_name}-{filename}" if base_name else filename
            })

            new_doc, new_doc_vers = create_document_stack(url_input, file_content, filename, extension, tenant_id)
            task_id = start_embedding_task(tenant_id, new_doc_vers.id)

            results.append({
                'url': url,
                'document_id': new_doc.id,
                'document_version_id': new_doc_vers.id,
                'task_id': task_id,
                'status': 'success'
            })
        except Exception as e:
            # Log and continue: one bad URL must not sink the batch.
            current_app.logger.error(f"Error processing URL {url}: {str(e)}")
            results.append({
                'url': url,
                'status': 'error',
                'message': str(e)
            })
    return results
|
|
|
|
|
|
def start_embedding_task(tenant_id, doc_vers_id):
    """Queue the Celery 'create_embeddings' task and return its task id."""
    task = current_celery.send_task(
        'create_embeddings',
        args=[tenant_id, doc_vers_id,],
        queue='embeddings',
    )
    current_app.logger.info(f'Embedding creation started for tenant {tenant_id}, '
                            f'Document Version {doc_vers_id}. '
                            f'Embedding creation task: {task.id}')
    return task.id
|
|
|
|
|
|
def validate_file_type(extension):
    """Raise EveAIUnsupportedFileType when *extension* is not configured as supported."""
    # Hoist the config lookup used three times below.
    supported = current_app.config['SUPPORTED_FILE_TYPES']
    current_app.logger.debug(f'Validating file type {extension}')
    current_app.logger.debug(f'Supported file types: {supported}')
    if extension not in supported:
        raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
                                       f"Supported filetypes: {', '.join(supported)}")
|
|
|
|
|
|
def get_filename_from_url(url):
    """Derive an '.html' filename from a URL path ('index.html' for bare roots)."""
    last_segment = urlparse(url).path.split('/')[-1]
    if not last_segment:
        # Root or trailing-slash URLs have no final path segment.
        last_segment = 'index'
    return last_segment if last_segment.endswith('.html') else last_segment + '.html'
|
|
|
|
|
|
def get_documents_list(page, per_page):
    """Return a pagination object over all documents, newest first."""
    return (Document.query
            .order_by(desc(Document.created_at))
            .paginate(page=page, per_page=per_page, error_out=False))
|
|
|
|
|
|
def edit_document(document_id, name, valid_from, valid_to):
    """Update a document's name and validity window.

    Returns:
        (document, None) on success, (None, error_message) on DB failure.
    """
    document = Document.query.get_or_404(document_id)
    document.name = name
    document.valid_from = valid_from
    document.valid_to = valid_to
    update_logging_information(document, dt.now(tz.utc))

    try:
        db.session.add(document)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        return None, str(e)
    return document, None
|
|
|
|
|
|
def edit_document_version(version_id, user_context):
    """Update the user_context of a document version.

    Returns:
        (version, None) on success, (None, error_message) on DB failure.
    """
    version = DocumentVersion.query.get_or_404(version_id)
    version.user_context = user_context
    update_logging_information(version, dt.now(tz.utc))

    try:
        db.session.add(version)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        return None, str(e)
    return version, None
|
|
|
|
|
|
def refresh_document_with_info(doc_id, api_input):
    """Re-download a URL-backed document and create a fresh version for it.

    Args:
        doc_id: id of the Document to refresh (404s when unknown).
        api_input: dict with optional 'language', 'user_context' and
            'user_metadata' overrides; missing keys fall back to the values
            of the latest existing version.

    Returns:
        (new_version, task_id) on success, or (None, error_message) when the
        document cannot be refreshed or the DB commit fails.
    """
    doc = Document.query.get_or_404(doc_id)
    old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()

    # Fixed: previously an AttributeError when the document had no versions.
    if old_doc_vers is None:
        return None, "This document has no versions yet and cannot be refreshed."

    if not old_doc_vers.url:
        return None, "This document has no URL. Only documents with a URL can be refreshed."

    new_doc_vers = create_version_for_document(
        doc,
        old_doc_vers.url,
        api_input.get('language', old_doc_vers.language),
        api_input.get('user_context', old_doc_vers.user_context),
        api_input.get('user_metadata', old_doc_vers.user_metadata)
    )
    # NOTE: create_version_for_document already stamps the audit fields, so
    # the previous second set_logging_information call here was redundant.

    try:
        db.session.add(new_doc_vers)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        return None, str(e)

    # Determine the extension from the live Content-Type, then download.
    # timeout added: a hung remote server must not block the worker forever.
    response = requests.head(old_doc_vers.url, allow_redirects=True, timeout=30)
    content_type = response.headers.get('Content-Type', '').split(';')[0]
    extension = get_extension_from_content_type(content_type)

    response = requests.get(old_doc_vers.url, timeout=30)
    response.raise_for_status()
    file_content = response.content

    upload_file_for_version(new_doc_vers, file_content, extension, doc.tenant_id)

    # Consistency: reuse the shared dispatch helper instead of an inline
    # duplicate of the Celery send_task call.
    task_id = start_embedding_task(doc.tenant_id, new_doc_vers.id)

    return new_doc_vers, task_id
|
|
|
|
|
|
# Refresh a document by re-fetching its URL, reusing the latest version's settings.
def refresh_document(doc_id):
    """Refresh *doc_id* using the language/context/metadata of its latest version.

    Returns:
        The (new_version, task_id) / (None, error_message) pair from
        refresh_document_with_info.
    """
    current_app.logger.info(f'Refreshing document {doc_id}')
    # get_or_404 kept for its side effect: unknown ids abort with a 404.
    doc = Document.query.get_or_404(doc_id)
    old_doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()

    # Fixed: previously an AttributeError when the document had no versions.
    if old_doc_vers is None:
        return None, "This document has no versions yet and cannot be refreshed."

    api_input = {
        'language': old_doc_vers.language,
        'user_context': old_doc_vers.user_context,
        'user_metadata': old_doc_vers.user_metadata
    }

    return refresh_document_with_info(doc_id, api_input)
|
|
|
|
|
|
# Function triggered when a document_version is created or updated
def mark_tenant_storage_dirty(tenant_id):
    """Flag the tenant so the periodic usage job recalculates its storage.

    Args:
        tenant_id: id of the tenant whose storage usage became stale.

    Raises:
        SQLAlchemyError: when the commit fails (session rolled back first).
    """
    tenant = db.session.query(Tenant).filter_by(id=tenant_id).first()
    # Fixed: previously an AttributeError when the tenant does not exist.
    if tenant is None:
        current_app.logger.warning(f'Cannot mark storage dirty: tenant {tenant_id} not found')
        return
    tenant.storage_dirty = True
    try:
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        current_app.logger.error(f'Error marking storage dirty for tenant {tenant_id}: {e}')
        raise
|