- Full API application, streamlined; de-duplicated document-handling code into document_utils.py
- Added metadata fields to DocumentVersion
- Added Docker container to support the API
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
from datetime import datetime as dt, timezone as tz
|
||||
|
||||
from sqlalchemy import desc
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from werkzeug.utils import secure_filename
|
||||
from common.models.document import Document, DocumentVersion
|
||||
@@ -9,8 +11,7 @@ from flask_security import current_user
|
||||
import requests
|
||||
from urllib.parse import urlparse, unquote
|
||||
import os
|
||||
from .eveai_exceptions import EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType, \
|
||||
EveAIYoutubeError
|
||||
from .eveai_exceptions import EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType
|
||||
|
||||
|
||||
def create_document_stack(api_input, file, filename, extension, tenant_id):
|
||||
@@ -22,7 +23,8 @@ def create_document_stack(api_input, file, filename, extension, tenant_id):
|
||||
new_doc_vers = create_version_for_document(new_doc,
|
||||
api_input.get('url', ''),
|
||||
api_input.get('language', 'en'),
|
||||
api_input.get('user_context', '')
|
||||
api_input.get('user_context', ''),
|
||||
api_input.get('user_metadata'),
|
||||
)
|
||||
db.session.add(new_doc_vers)
|
||||
|
||||
@@ -59,7 +61,7 @@ def create_document(form, filename, tenant_id):
|
||||
return new_doc
|
||||
|
||||
|
||||
def create_version_for_document(document, url, language, user_context):
|
||||
def create_version_for_document(document, url, language, user_context, user_metadata):
|
||||
new_doc_vers = DocumentVersion()
|
||||
if url != '':
|
||||
new_doc_vers.url = url
|
||||
@@ -72,6 +74,9 @@ def create_version_for_document(document, url, language, user_context):
|
||||
if user_context != '':
|
||||
new_doc_vers.user_context = user_context
|
||||
|
||||
if user_metadata != '' and user_metadata is not None:
|
||||
new_doc_vers.user_metadata = user_metadata
|
||||
|
||||
new_doc_vers.document = document
|
||||
|
||||
set_logging_information(new_doc_vers, dt.now(tz.utc))
|
||||
@@ -211,27 +216,6 @@ def process_multiple_urls(urls, tenant_id, api_input):
|
||||
return results
|
||||
|
||||
|
||||
def prepare_youtube_document(url, tenant_id, api_input):
|
||||
try:
|
||||
filename = f"placeholder.youtube"
|
||||
extension = 'youtube'
|
||||
|
||||
new_doc = create_document(api_input, filename, tenant_id)
|
||||
new_doc_vers = create_version_for_document(new_doc, url, api_input['language'], api_input['user_context'])
|
||||
|
||||
new_doc_vers.file_type = extension
|
||||
new_doc_vers.file_name = new_doc_vers.calc_file_name()
|
||||
new_doc_vers.file_location = new_doc_vers.calc_file_location()
|
||||
|
||||
db.session.add(new_doc)
|
||||
db.session.add(new_doc_vers)
|
||||
db.session.commit()
|
||||
|
||||
return new_doc, new_doc_vers
|
||||
except Exception as e:
|
||||
raise EveAIYoutubeError(f"Error preparing YouTube document: {str(e)}")
|
||||
|
||||
|
||||
def start_embedding_task(tenant_id, doc_vers_id):
|
||||
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
|
||||
tenant_id,
|
||||
@@ -249,3 +233,88 @@ def validate_file_type(extension):
|
||||
if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
|
||||
raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
|
||||
f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
|
||||
|
||||
|
||||
def get_filename_from_url(url):
    """Derive an ``.html`` file name from the last path segment of *url*.

    The query string and fragment are ignored because only the parsed
    path is inspected. An empty last segment (bare host or a path ending
    in ``/``) becomes ``index``. Any name that does not already end in
    ``.html`` gets that suffix appended, so ``report.pdf`` becomes
    ``report.pdf.html``.
    """
    # Everything after the final '/' of the path; the whole path when it has no '/'.
    last_segment = urlparse(url).path.rpartition('/')[2]
    base_name = last_segment or 'index'
    return base_name if base_name.endswith('.html') else f"{base_name}.html"
|
||||
|
||||
|
||||
def get_documents_list(page, per_page):
    """Return one page of documents ordered by ``created_at``, newest first.

    The result is whatever ``query.paginate`` returns. ``error_out=False``
    is passed through, so the pagination call is asked not to abort on an
    out-of-range page.
    """
    newest_first = desc(Document.created_at)
    return Document.query.order_by(newest_first).paginate(
        page=page,
        per_page=per_page,
        error_out=False,
    )
|
||||
|
||||
|
||||
def edit_document(document_id, name, valid_from, valid_to):
    """Update a document's name and validity window and persist the change.

    The document is looked up with ``get_or_404``, which runs outside the
    ``try`` block, so a failed lookup is not converted into an error tuple.

    Returns:
        ``(document, None)`` when the commit succeeds, or
        ``(None, error_message)`` when SQLAlchemy raises; the session is
        rolled back in that case.
    """
    document = Document.query.get_or_404(document_id)

    document.name = name
    document.valid_from = valid_from
    document.valid_to = valid_to

    # Stamp the change with the current UTC time.
    modified_at = dt.now(tz.utc)
    update_logging_information(document, modified_at)

    try:
        db.session.add(document)
        db.session.commit()
    except SQLAlchemyError as exc:
        db.session.rollback()
        return None, str(exc)

    return document, None
|
||||
|
||||
|
||||
def edit_document_version(version_id, user_context):
    """Replace the ``user_context`` of a document version and persist it.

    The version is looked up with ``get_or_404``, which runs outside the
    ``try`` block, so a failed lookup is not converted into an error tuple.

    Returns:
        ``(document_version, None)`` when the commit succeeds, or
        ``(None, error_message)`` when SQLAlchemy raises; the session is
        rolled back in that case.
    """
    version = DocumentVersion.query.get_or_404(version_id)
    version.user_context = user_context

    # Stamp the change with the current UTC time.
    modified_at = dt.now(tz.utc)
    update_logging_information(version, modified_at)

    try:
        db.session.add(version)
        db.session.commit()
    except SQLAlchemyError as exc:
        db.session.rollback()
        return None, str(exc)

    return version, None
|
||||
|
||||
|
||||
def refresh_document(doc_id):
    """Re-download a URL-backed document and store it as a new version.

    The most recent ``DocumentVersion`` (highest id) supplies the URL,
    language, user context and user metadata for the new version. After
    the new version is committed, the downloaded content is handed to
    ``upload_file_for_version`` and a ``create_embeddings`` task is queued
    on the ``embeddings`` queue.

    Args:
        doc_id: Primary key of the document to refresh. The lookup uses
            ``get_or_404``.

    Returns:
        ``(new_document_version, task_id)`` on success, or
        ``(None, error_message)`` when the document has no version, the
        latest version has no URL, or the database commit fails.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an error status. Nothing has been
            written to the database at that point.
    """
    doc = Document.query.get_or_404(doc_id)
    doc_vers = DocumentVersion.query.filter_by(doc_id=doc_id).order_by(desc(DocumentVersion.id)).first()

    # .first() yields None when the document has no versions yet; guard
    # before dereferencing it.
    if doc_vers is None:
        return None, f"This document {doc_id} has no versions. Only documents with a URL can be refreshed."

    if not doc_vers.url:
        return None, f"This document {doc_id} has no URL. Only documents with a URL can be refreshed."

    # Download BEFORE creating the new version so that a failed or
    # timed-out download does not leave behind a committed version row
    # that has no file. The timeout keeps an unresponsive server from
    # blocking the caller indefinitely.
    response = requests.get(doc_vers.url, timeout=30)
    response.raise_for_status()
    file_content = response.content

    # Determine file extension based on the Content-Type of the response
    # that actually carried the content (rather than a separate HEAD
    # request, which some servers reject or answer differently).
    content_type = response.headers.get('Content-Type', '').split(';')[0]
    extension = get_extension_from_content_type(content_type)

    new_doc_vers = create_version_for_document(doc, doc_vers.url, doc_vers.language, doc_vers.user_context,
                                               doc_vers.user_metadata)

    try:
        db.session.add(new_doc_vers)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        return None, str(e)

    upload_file_for_version(new_doc_vers, file_content, extension, doc.tenant_id)

    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        doc.tenant_id,
        new_doc_vers.id,
    ])

    return new_doc_vers, task.id
|
||||
|
||||
Reference in New Issue
Block a user