- Improvements to document uploads (accept file types other than HTML when entering a URL)

- Introduction of API functionality (to be continued). Deduplication of document and URL uploads between views and API. - Improvements to document processing: introduction of processor classes to streamline document inputs. - Removed pure YouTube functionality, as YouTube retrieval of documents continuously changes, but added upload of srt, mp3, ogg and mp4 files.
2024-09-02 12:37:44 +02:00
parent a158655247
commit 914c265afe
21 changed files with 1425 additions and 852 deletions
--- a/eveai_app/init.py
+++ b/eveai_app/init.py
@@ -27,7 +27,6 @@ def create_app(config_file=None):
    app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_port=1)

    environment = os.getenv('FLASK_ENV', 'development')
-    print(environment)

    match environment:
        case 'development':
@@ -49,8 +48,6 @@ def create_app(config_file=None):
    logger = logging.getLogger(__name__)

    logger.info("eveai_app starting up")
-    logger.debug("start config")
-    logger.debug(app.config)

    # Register extensions

@@ -95,14 +92,11 @@ def create_app(config_file=None):
        }
        return jsonify(response), 500

-    @app.before_request
-    def before_request():
-        # app.logger.debug(f"Before request - Session ID: {session.sid}")
-        app.logger.debug(f"Before request - Session data: {session}")
-        app.logger.debug(f"Before request - Request headers: {request.headers}")
-
-    # Register API
-    register_api(app)
+    # @app.before_request
+    # def before_request():
+    #     # app.logger.debug(f"Before request - Session ID: {session.sid}")
+    #     app.logger.debug(f"Before request - Session data: {session}")
+    #     app.logger.debug(f"Before request - Request headers: {request.headers}")

    # Register template filters
    register_filters(app)
@@ -138,9 +132,3 @@ def register_blueprints(app):
    app.register_blueprint(security_bp)
    from .views.interaction_views import interaction_bp
    app.register_blueprint(interaction_bp)
-
-
-def register_api(app):
-    pass
-    # from . import api
-    # app.register_blueprint(api.bp, url_prefix='/api')
--- a/eveai_app/templates/navbar.html
+++ b/eveai_app/templates/navbar.html
@@ -84,7 +84,6 @@
                                    {'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
                                    {'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
                                    {'name': 'Add a list of URLs', 'url': '/document/add_urls', 'roles': ['Super User', 'Tenant Admin']},
-                                    {'name': 'Add Youtube Document' , 'url': '/document/add_youtube', 'roles': ['Super User', 'Tenant Admin']},
                                    {'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
                                    {'name': 'All Document Versions', 'url': '/document/document_versions_list', 'roles': ['Super User', 'Tenant Admin']},
                                    {'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']},
--- a/eveai_app/views/document_forms.py
+++ b/eveai_app/views/document_forms.py
@@ -1,14 +1,21 @@
-from flask import session
+from flask import session, current_app
 from flask_wtf import FlaskForm
 from wtforms import (StringField, BooleanField, SubmitField, DateField,
                     SelectField, FieldList, FormField, TextAreaField, URLField)
-from wtforms.validators import DataRequired, Length, Optional, URL
+from wtforms.validators import DataRequired, Length, Optional, URL, ValidationError
 from flask_wtf.file import FileField, FileAllowed, FileRequired


+def allowed_file(form, field):
+    if field.data:
+        filename = field.data.filename
+        allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
+        if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
+            raise ValidationError('Unsupported file type.')
+
+
 class AddDocumentForm(FlaskForm):
-    file = FileField('File', validators=[FileAllowed(['pdf', 'txt', 'html']),
-                                         FileRequired()])
+    file = FileField('File', validators=[FileRequired(), allowed_file])
    name = StringField('Name', validators=[Length(max=100)])
    language = SelectField('Language', choices=[], validators=[Optional()])
    user_context = TextAreaField('User Context', validators=[Optional()])
--- a/eveai_app/views/document_views.py
+++ b/eveai_app/views/document_views.py
@@ -18,6 +18,10 @@ from minio.error import S3Error

 from common.models.document import Document, DocumentVersion
 from common.extensions import db, minio_client
+from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
+    process_multiple_urls, prepare_youtube_document, create_version_for_document, upload_file_for_version
+from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
+    EveAIDoubleURLException, EveAIYoutubeError
 from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm, \
    AddURLsForm
 from common.utils.middleware import mw_before_request
@@ -56,30 +60,37 @@ def before_request():
@roles_accepted('Super User', 'Tenant Admin')
 def add_document():
    form = AddDocumentForm()
+    current_app.logger.debug('Adding document')

-    # If the form is submitted
    if form.validate_on_submit():
-        current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
-        file = form.file.data
-        filename = secure_filename(file.filename)
-        extension = filename.rsplit('.', 1)[1].lower()
-        form_dict = form_to_dict(form)
+        try:
+            current_app.logger.debug('Validating file type')
+            tenant_id = session['tenant']['id']
+            file = form.file.data
+            filename = secure_filename(file.filename)
+            extension = filename.rsplit('.', 1)[1].lower()

-        new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
+            validate_file_type(extension)

-        task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
-            session['tenant']['id'],
-            new_doc_vers.id,
-        ])
-        current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
-                                f'Document Version {new_doc_vers.id}. '
-                                f'Embedding creation task: {task.id}')
-        flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
-              'success')
+            api_input = {
+                'name': form.name.data,
+                'language': form.language.data,
+                'user_context': form.user_context.data,
+                'valid_from': form.valid_from.data
+            }

-        return redirect(prefixed_url_for('document_bp.documents'))
-    else:
-        form_validation_failed(request, form)
+            new_doc, new_doc_vers = create_document_stack(api_input, file, filename, extension, tenant_id)
+            task_id = start_embedding_task(tenant_id, new_doc_vers.id)
+
+            flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
+                  'success')
+            return redirect(prefixed_url_for('document_bp.documents'))
+
+        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
+            flash(str(e), 'error')
+        except Exception as e:
+            current_app.logger.error(f'Error adding document: {str(e)}')
+            flash('An error occurred while adding the document.', 'error')

    return render_template('document/add_document.html', form=form)

@@ -90,189 +101,107 @@ def add_url():
    form = AddURLForm()

    if form.validate_on_submit():
-        current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
-        url = form.url.data
-
        try:
-            response = requests.head(url, allow_redirects=True)
-            content_type = response.headers.get('Content-Type', '').split(';')[0]
+            tenant_id = session['tenant']['id']
+            url = form.url.data

-            # Determine file extension based on Content-Type
-            extension = get_extension_from_content_type(content_type)
+            file_content, filename, extension = process_url(url, tenant_id)

-            # Generate filename
-            parsed_url = urlparse(url)
-            path = unquote(parsed_url.path)
-            filename = os.path.basename(path)
+            api_input = {
+                'name': form.name.data or filename,
+                'url': url,
+                'language': form.language.data,
+                'user_context': form.user_context.data,
+                'valid_from': form.valid_from.data
+            }

-            if not filename or '.' not in filename:
-                # Use the last part of the path or a default name
-                filename = path.strip('/').split('/')[-1] or 'document'
-                filename = secure_filename(f"{filename}.{extension}")
-            else:
-                filename = secure_filename(filename)
+            new_doc, new_doc_vers = create_document_stack(api_input, file_content, filename, extension, tenant_id)
+            task_id = start_embedding_task(tenant_id, new_doc_vers.id)

-            # Check if a document with this URL already exists
-            existing_doc = DocumentVersion.query.filter_by(url=url).first()
-            if existing_doc:
-                flash(f'A document with URL {url} already exists. No new document created.', 'info')
-                return redirect(prefixed_url_for('document_bp.documents'))
-
-            # Download the content
-            response = requests.get(url)
-            response.raise_for_status()
-            file_content = response.content
-
-            # Create document and document version
-            form_dict = form_to_dict(form)
-            new_doc, new_doc_vers = create_document_stack(form_dict, file_content, filename, extension)
-
-            # Upload file to storage
-            minio_client.upload_document_file(
-                session['tenant']['id'],
-                new_doc_vers.doc_id,
-                new_doc_vers.language,
-                new_doc_vers.id,
-                filename,
-                file_content
-            )
-
-            # Start embedding task
-            task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
-                session['tenant']['id'],
-                new_doc_vers.id,
-            ])
-
-            current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
-                                    f'Document Version {new_doc_vers.id}. '
-                                    f'Embedding creation task: {task.id}')
-            flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
+            flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
                  'success')
-
            return redirect(prefixed_url_for('document_bp.documents'))

-        except requests.RequestException as e:
-            current_app.logger.error(f'Error fetching URL {url}: {str(e)}')
-            flash(f'Error fetching URL: {str(e)}', 'danger')
-        except SQLAlchemyError as e:
-            current_app.logger.error(f'Database error: {str(e)}')
-            flash('An error occurred while saving the document.', 'danger')
+        except EveAIDoubleURLException:
+            flash(f'A document with url {url} already exists. No new document created.', 'info')
+        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
+            flash(str(e), 'error')
        except Exception as e:
-            current_app.logger.error(f'Unexpected error: {str(e)}')
-            flash('An unexpected error occurred.', 'danger')
+            current_app.logger.error(f'Error adding document: {str(e)}')
+            flash('An error occurred while adding the document.', 'error')

    return render_template('document/add_url.html', form=form)


-def get_extension_from_content_type(content_type):
-    content_type_map = {
-        'text/html': 'html',
-        'application/pdf': 'pdf',
-        'text/plain': 'txt',
-        'application/msword': 'doc',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
-        # Add more mappings as needed
-    }
-    return content_type_map.get(content_type, 'html')  # Default to 'html' if unknown
-
-
@document_bp.route('/add_urls', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
 def add_urls():
    form = AddURLsForm()

    if form.validate_on_submit():
-        urls = form.urls.data.split('\n')
-        urls = [url.strip() for url in urls if url.strip()]
+        try:
+            tenant_id = session['tenant']['id']
+            urls = form.urls.data.split('\n')
+            urls = [url.strip() for url in urls if url.strip()]

-        for i, url in enumerate(urls):
-            try:
-                doc_vers = DocumentVersion.query.filter_by(url=url).all()
-                if doc_vers:
-                    current_app.logger.info(f'A document with url {url} already exists. No new document created.')
-                    flash(f'A document with url {url} already exists. No new document created.', 'info')
-                    continue
+            api_input = {
+                'name': form.name.data,
+                'language': form.language.data,
+                'user_context': form.user_context.data,
+                'valid_from': form.valid_from.data
+            }

-                html = fetch_html(url)
-                file = io.BytesIO(html)
+            results = process_multiple_urls(urls, tenant_id, api_input)

-                parsed_url = urlparse(url)
-                path_parts = parsed_url.path.split('/')
-                filename = path_parts[-1] if path_parts[-1] else 'index'
-                if not filename.endswith('.html'):
-                    filename += '.html'
+            for result in results:
+                if result['status'] == 'success':
+                    flash(
+                        f"Processed URL: {result['url']} - Document ID: {result['document_id']}, Version ID: {result['document_version_id']}",
+                        'success')
+                else:
+                    flash(f"Error processing URL: {result['url']} - {result['message']}", 'error')

-                # Use the name prefix if provided, otherwise use the filename
-                doc_name = f"{form.name.data}-{filename}" if form.name.data else filename
+            return redirect(prefixed_url_for('document_bp.documents'))

-                new_doc, new_doc_vers = create_document_stack({
-                    'name': doc_name,
-                    'url': url,
-                    'language': form.language.data,
-                    'user_context': form.user_context.data,
-                    'valid_from': form.valid_from.data
-                }, file, filename, 'html')
-
-                task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
-                    session['tenant']['id'],
-                    new_doc_vers.id,
-                ])
-                current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
-                                        f'Document Version {new_doc_vers.id}. '
-                                        f'Embedding creation task: {task.id}')
-                flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
-                      'success')
-
-            except Exception as e:
-                current_app.logger.error(f"Error processing URL {url}: {str(e)}")
-                flash(f'Error processing URL {url}: {str(e)}', 'danger')
-
-        return redirect(prefixed_url_for('document_bp.documents'))
-    else:
-        form_validation_failed(request, form)
+        except Exception as e:
+            current_app.logger.error(f'Error adding multiple URLs: {str(e)}')
+            flash('An error occurred while adding the URLs.', 'error')

    return render_template('document/add_urls.html', form=form)

+
@document_bp.route('/add_youtube', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
 def add_youtube():
    form = AddYoutubeForm()

    if form.validate_on_submit():
-        current_app.logger.info(f'Adding Youtube document for tenant {session["tenant"]["id"]}')
-        url = form.url.data
-        current_app.logger.debug(f'Value of language field: {form.language.data}')
+        try:
+            tenant_id = session['tenant']['id']
+            url = form.url.data

-        doc_vers = DocumentVersion.query.filter_by(url=url).all()
-        if doc_vers:
-            current_app.logger.info(f'A document with url {url} already exists. No new document created.')
-            flash(f'A document with url {url} already exists. No new document created.', 'info')
+            api_input = {
+                'name': form.name.data,
+                'language': form.language.data,
+                'user_context': form.user_context.data,
+                'valid_from': form.valid_from.data
+            }
+
+            new_doc, new_doc_vers = prepare_youtube_document(url, tenant_id, api_input)
+            task_id = start_embedding_task(tenant_id, new_doc_vers.id)
+
+            flash(
+                f'Processing on YouTube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
+                'success')
            return redirect(prefixed_url_for('document_bp.documents'))
-        # As downloading a Youtube document can take quite some time, we offload this downloading to the worker
-        # We just pass a simple file to get things conform
-        file = "Youtube placeholder file"

-        filename = 'placeholder.youtube'
-        extension = 'youtube'
-        form_dict = form_to_dict(form)
-        current_app.logger.debug(f'Form data: {form_dict}')
-
-        new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
-
-        task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
-            session['tenant']['id'],
-            new_doc_vers.id,
-        ])
-        current_app.logger.info(f'Processing and Embedding on Youtube document started for tenant '
-                                f'{session["tenant"]["id"]}, '
-                                f'Document Version {new_doc_vers.id}. '
-                                f'Processing and Embedding Youtube task: {task.id}')
-        flash(f'Processing on Youtube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
-              'success')
-
-        return redirect(prefixed_url_for('document_bp.documents'))
-    else:
-        form_validation_failed(request, form)
+        except EveAIYoutubeError as e:
+            flash(str(e), 'error')
+        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
+            flash(str(e), 'error')
+        except Exception as e:
+            current_app.logger.error(f'Error adding YouTube document: {str(e)}')
+            flash('An error occurred while adding the YouTube document.', 'error')

    return render_template('document/add_youtube.html', form=form)

@@ -487,7 +416,7 @@ def refresh_document(doc_id):
    current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
                            f'Document Version {new_doc_vers.id}')

-    upload_file_for_version(new_doc_vers, file, extension)
+    upload_file_for_version(new_doc_vers, file, extension, session["tenant"]["id"])

    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        session['tenant']['id'],
@@ -535,116 +464,11 @@ def update_logging_information(obj, timestamp):
    obj.updated_by = current_user.id


-def create_document_stack(form, file, filename, extension):
-    # Create the Document
-    new_doc = create_document(form, filename)
-
-    # Create the DocumentVersion
-    new_doc_vers = create_version_for_document(new_doc,
-                                               form.get('url', ''),
-                                               form.get('language', 'en'),
-                                               form.get('user_context', '')
-                                               )
-
-    try:
-        db.session.add(new_doc)
-        db.session.add(new_doc_vers)
-        db.session.commit()
-    except SQLAlchemyError as e:
-        current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}')
-        flash('Error adding document.', 'alert')
-        db.session.rollback()
-        error = e.args
-        raise
-    except Exception as e:
-        current_app.logger.error('Unknown error')
-        raise
-
-    current_app.logger.info(f'Document added successfully for tenant {session["tenant"]["id"]}, '
-                            f'Document Version {new_doc.id}')
-
-    upload_file_for_version(new_doc_vers, file, extension)
-
-    return new_doc, new_doc_vers
-
-
 def log_session_state(session, msg=""):
    current_app.logger.debug(f"{msg} - Session dirty: {session.dirty}")
    current_app.logger.debug(f"{msg} - Session new: {session.new}")


-def create_document(form, filename):
-    new_doc = Document()
-    if form['name'] == '':
-        new_doc.name = filename.rsplit('.', 1)[0]
-    else:
-        new_doc.name = form['name']
-
-    if form['valid_from'] and form['valid_from'] != '':
-        new_doc.valid_from = form['valid_from']
-    else:
-        new_doc.valid_from = dt.now(tz.utc)
-    new_doc.tenant_id = session['tenant']['id']
-    set_logging_information(new_doc, dt.now(tz.utc))
-
-    return new_doc
-
-
-def create_version_for_document(document, url, language, user_context):
-    new_doc_vers = DocumentVersion()
-    if url != '':
-        new_doc_vers.url = url
-
-    if language == '':
-        new_doc_vers.language = session['default_language']
-    else:
-        new_doc_vers.language = language
-
-    if user_context != '':
-        new_doc_vers.user_context = user_context
-
-    new_doc_vers.document = document
-
-    set_logging_information(new_doc_vers, dt.now(tz.utc))
-
-    return new_doc_vers
-
-
-def upload_file_for_version(doc_vers, file, extension):
-    doc_vers.file_type = extension
-    doc_vers.file_name = doc_vers.calc_file_name()
-    doc_vers.file_location = doc_vers.calc_file_location()
-
-    # Normally, the tenant bucket should exist. But let's be on the safe side if a migration took place.
-    tenant_id = session['tenant']['id']
-    minio_client.create_tenant_bucket(tenant_id)
-
-    try:
-        minio_client.upload_document_file(
-            tenant_id,
-            doc_vers.doc_id,
-            doc_vers.language,
-            doc_vers.id,
-            doc_vers.file_name,
-            file
-        )
-        db.session.commit()
-        current_app.logger.info(f'Successfully saved document to MinIO for tenant {tenant_id} for '
-                                f'document version {doc_vers.id} while uploading file.')
-    except S3Error as e:
-        db.session.rollback()
-        flash('Error saving document to MinIO.', 'error')
-        current_app.logger.error(
-            f'Error saving document to MinIO for tenant {tenant_id}: {e}')
-        raise
-    except SQLAlchemyError as e:
-        db.session.rollback()
-        flash('Error saving document metadata.', 'error')
-        current_app.logger.error(
-            f'Error saving document metadata for tenant {tenant_id}: {e}')
-        raise
-
-
 def fetch_html(url):
    # Fetches HTML content from a URL
    try: