- Full API application: streamlined, with the document handling code de-duplicated into document_utils.py

- Added metadata fields (user_metadata, system_metadata) to DocumentVersion
- Docker container to support API
This commit is contained in:
Josako
2024-09-09 16:11:42 +02:00
parent 341ba47d1c
commit 76cb825660
39 changed files with 598 additions and 6177 deletions

View File

@@ -9,7 +9,7 @@ from flask_socketio import SocketIO
from flask_jwt_extended import JWTManager
from flask_session import Session
from flask_wtf import CSRFProtect
from flask_restful import Api
from flask_restx import Api
from .utils.nginx_utils import prefixed_url_for
from .utils.simple_encryption import SimpleEncryption
@@ -28,9 +28,6 @@ cors = CORS()
socketio = SocketIO()
jwt = JWTManager()
session = Session()
api = Api()
# kms_client = JosKMSClient.from_service_account_json('config/gc_sa_eveai.json')
api_rest = Api()
simple_encryption = SimpleEncryption()
minio_client = MinioClient()

View File

@@ -1,6 +1,7 @@
from common.extensions import db
from .user import User, Tenant
from pgvector.sqlalchemy import Vector
from sqlalchemy.dialects.postgresql import JSONB
class Document(db.Model):
@@ -33,6 +34,8 @@ class DocumentVersion(db.Model):
language = db.Column(db.String(2), nullable=False)
user_context = db.Column(db.Text, nullable=True)
system_context = db.Column(db.Text, nullable=True)
user_metadata = db.Column(JSONB, nullable=True)
system_metadata = db.Column(JSONB, nullable=True)
# Versioning Information
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())

View File

@@ -1,4 +1,6 @@
from datetime import datetime as dt, timezone as tz
from sqlalchemy import desc
from sqlalchemy.exc import SQLAlchemyError
from werkzeug.utils import secure_filename
from common.models.document import Document, DocumentVersion
@@ -9,8 +11,7 @@ from flask_security import current_user
import requests
from urllib.parse import urlparse, unquote
import os
from .eveai_exceptions import EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType, \
EveAIYoutubeError
from .eveai_exceptions import EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType
def create_document_stack(api_input, file, filename, extension, tenant_id):
@@ -22,7 +23,8 @@ def create_document_stack(api_input, file, filename, extension, tenant_id):
new_doc_vers = create_version_for_document(new_doc,
api_input.get('url', ''),
api_input.get('language', 'en'),
api_input.get('user_context', '')
api_input.get('user_context', ''),
api_input.get('user_metadata'),
)
db.session.add(new_doc_vers)
@@ -59,7 +61,7 @@ def create_document(form, filename, tenant_id):
return new_doc
def create_version_for_document(document, url, language, user_context):
def create_version_for_document(document, url, language, user_context, user_metadata):
new_doc_vers = DocumentVersion()
if url != '':
new_doc_vers.url = url
@@ -72,6 +74,9 @@ def create_version_for_document(document, url, language, user_context):
if user_context != '':
new_doc_vers.user_context = user_context
if user_metadata != '' and user_metadata is not None:
new_doc_vers.user_metadata = user_metadata
new_doc_vers.document = document
set_logging_information(new_doc_vers, dt.now(tz.utc))
@@ -211,27 +216,6 @@ def process_multiple_urls(urls, tenant_id, api_input):
return results
def prepare_youtube_document(url, tenant_id, api_input):
    """Create and persist the Document / DocumentVersion pair for a YouTube URL.

    A placeholder file name with the pseudo-extension ``youtube`` is used; the
    version's file name and location are then derived through its own
    ``calc_file_name()`` / ``calc_file_location()`` methods.

    Args:
        url: URL stored on the new document version.
        tenant_id: Tenant passed on to ``create_document``.
        api_input: Mapping that must contain ``'language'`` and
            ``'user_context'``; it is also handed to ``create_document``.

    Returns:
        Tuple ``(new_doc, new_doc_vers)`` after a successful commit.

    Raises:
        EveAIYoutubeError: If any step fails. The original exception is kept
            as ``__cause__`` and the session is rolled back first.
    """
    try:
        filename = "placeholder.youtube"
        extension = 'youtube'

        new_doc = create_document(api_input, filename, tenant_id)
        # NOTE(review): called with four arguments - confirm this matches the
        # current signature of create_version_for_document.
        new_doc_vers = create_version_for_document(new_doc, url, api_input['language'], api_input['user_context'])

        new_doc_vers.file_type = extension
        new_doc_vers.file_name = new_doc_vers.calc_file_name()
        new_doc_vers.file_location = new_doc_vers.calc_file_location()

        db.session.add(new_doc)
        db.session.add(new_doc_vers)
        db.session.commit()

        return new_doc, new_doc_vers
    except Exception as e:
        # Leave the session usable for the caller, as the other helpers in
        # this module do after a failed commit.
        db.session.rollback()
        raise EveAIYoutubeError(f"Error preparing YouTube document: {str(e)}") from e
def start_embedding_task(tenant_id, doc_vers_id):
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
tenant_id,
@@ -249,3 +233,88 @@ def validate_file_type(extension):
if extension not in current_app.config['SUPPORTED_FILE_TYPES']:
raise EveAIUnsupportedFileType(f"Filetype {extension} is currently not supported. "
f"Supported filetypes: {', '.join(current_app.config['SUPPORTED_FILE_TYPES'])}")
def get_filename_from_url(url):
    """Derive an ``.html`` file name from the last path segment of *url*.

    Only the path component is considered (query string and fragment are
    ignored). An empty last segment - e.g. a URL ending in ``/`` or having no
    path at all - becomes ``index``. ``.html`` is appended unless the name
    already ends with it.
    """
    last_segment = urlparse(url).path.rpartition('/')[2]
    base_name = last_segment if last_segment != '' else 'index'
    if base_name.endswith('.html'):
        return base_name
    return base_name + '.html'
def get_documents_list(page, per_page):
    """Return one page of documents, most recently created first.

    Args:
        page: Page number forwarded to ``paginate``.
        per_page: Page size forwarded to ``paginate``.

    Returns:
        The pagination object produced by ``query.paginate``. ``error_out`` is
        disabled, so the call is not asked to abort on an invalid page.
    """
    newest_first = Document.query.order_by(desc(Document.created_at))
    return newest_first.paginate(page=page, per_page=per_page, error_out=False)
def edit_document(document_id, name, valid_from, valid_to):
    """Update the name and validity window of an existing document.

    The document is fetched with ``get_or_404``, its ``name``, ``valid_from``
    and ``valid_to`` are overwritten, and its logging information is refreshed
    with the current UTC time before the change is committed.

    Returns:
        ``(document, None)`` on success, or ``(None, error_message)`` when the
        commit raises ``SQLAlchemyError`` (the session is rolled back).
    """
    document = Document.query.get_or_404(document_id)

    document.name = name
    document.valid_from = valid_from
    document.valid_to = valid_to
    update_logging_information(document, dt.now(tz.utc))

    try:
        db.session.add(document)
        db.session.commit()
    except SQLAlchemyError as exc:
        db.session.rollback()
        return None, str(exc)
    else:
        return document, None
def edit_document_version(version_id, user_context):
    """Replace the user context of an existing document version.

    The version is fetched with ``get_or_404``; only ``user_context`` is
    changed, and the logging information is refreshed with the current UTC
    time before committing.

    Returns:
        ``(version, None)`` on success, or ``(None, error_message)`` when the
        commit raises ``SQLAlchemyError`` (the session is rolled back).
    """
    version = DocumentVersion.query.get_or_404(version_id)
    version.user_context = user_context
    update_logging_information(version, dt.now(tz.utc))

    try:
        db.session.add(version)
        db.session.commit()
    except SQLAlchemyError as exc:
        db.session.rollback()
        return None, str(exc)
    else:
        return version, None
def refresh_document(doc_id, timeout=30):
    """Create a fresh version of a URL-backed document and queue its embedding.

    The newest existing version (highest id) supplies the URL, language, user
    context and user metadata for the new version. The URL is downloaded
    again, the content is handed to ``upload_file_for_version`` and a
    ``create_embeddings`` task is sent to the ``embeddings`` queue.

    Args:
        doc_id: Id of the document to refresh (looked up with ``get_or_404``).
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the caller indefinitely.

    Returns:
        ``(new_doc_vers, task_id)`` on success. On a handled failure the tuple
        is ``(None, error_message)`` - note that the second element is an
        error message in that case, not a task id.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status.
    """
    doc = Document.query.get_or_404(doc_id)
    latest_vers = (DocumentVersion.query
                   .filter_by(doc_id=doc_id)
                   .order_by(desc(DocumentVersion.id))
                   .first())

    # first() yields None when the document has no versions at all; without
    # this guard the URL check below would raise AttributeError.
    if latest_vers is None:
        return None, f"This document {doc_id} has no versions. Only documents with a URL can be refreshed."
    if not latest_vers.url:
        return None, f"This document {doc_id} has no URL. Only documents with a URL can be refreshed."

    new_doc_vers = create_version_for_document(doc, latest_vers.url, latest_vers.language,
                                               latest_vers.user_context, latest_vers.user_metadata)

    try:
        db.session.add(new_doc_vers)
        db.session.commit()
    except SQLAlchemyError as e:
        db.session.rollback()
        return None, str(e)

    # NOTE(review): the new version is already committed at this point; a
    # failed download below leaves it without uploaded content.
    # A single GET (redirects are followed by default) provides both the
    # Content-Type and the body, so no separate, unchecked HEAD is needed.
    response = requests.get(latest_vers.url, timeout=timeout)
    response.raise_for_status()

    # Determine file extension based on Content-Type (parameters such as
    # "; charset=..." are dropped).
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
    extension = get_extension_from_content_type(content_type)

    upload_file_for_version(new_doc_vers, response.content, extension, doc.tenant_id)

    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        doc.tenant_id,
        new_doc_vers.id,
    ])

    return new_doc_vers, task.id

View File

@@ -34,10 +34,3 @@ class EveAIUnsupportedFileType(EveAIException):
super().__init__(message, status_code, payload)
class EveAIYoutubeError(EveAIException):
    """Signal that a YouTube document could not be added.

    Defaults to HTTP status 400 and a generic message; all arguments are
    forwarded unchanged to ``EveAIException``.
    """

    def __init__(
        self,
        message="Youtube document creation failed",
        status_code=400,
        payload=None,
    ):
        super().__init__(message, status_code, payload)
# Add more custom exceptions as needed