- Allow for more complex and longer PDFs to be uploaded to Evie. First implementation of a processor for specific file types.

- Allow URLs to point to content other than just HTML. A URL can also refer to, e.g., PDF files.
2024-08-27 07:05:56 +02:00
parent 2ca006d82c
commit 122d1a18df
9 changed files with 458 additions and 86 deletions
--- a/eveai_app/views/document_views.py
+++ b/eveai_app/views/document_views.py
@@ -12,7 +12,7 @@ from werkzeug.utils import secure_filename
 from sqlalchemy.exc import SQLAlchemyError
 import requests
 from requests.exceptions import SSLError
-from urllib.parse import urlparse
+from urllib.parse import urlparse, unquote
 import io
 from minio.error import S3Error

@@ -89,49 +89,93 @@ def add_document():
 def add_url():
    form = AddURLForm()

-    # If the form is submitted
    if form.validate_on_submit():
        current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
        url = form.url.data

-        doc_vers = DocumentVersion.query.filter_by(url=url).all()
-        if doc_vers:
-            current_app.logger.info(f'A document with url {url} already exists. No new document created.')
-            flash(f'A document with url {url} already exists. No new document created.', 'info')
+        try:
+            response = requests.head(url, allow_redirects=True)
+            content_type = response.headers.get('Content-Type', '').split(';')[0]
+
+            # Determine file extension based on Content-Type
+            extension = get_extension_from_content_type(content_type)
+
+            # Generate filename
+            parsed_url = urlparse(url)
+            path = unquote(parsed_url.path)
+            filename = os.path.basename(path)
+
+            if not filename or '.' not in filename:
+                # Use the last part of the path or a default name
+                filename = path.strip('/').split('/')[-1] or 'document'
+                filename = secure_filename(f"{filename}.{extension}")
+            else:
+                filename = secure_filename(filename)
+
+            # Check if a document with this URL already exists
+            existing_doc = DocumentVersion.query.filter_by(url=url).first()
+            if existing_doc:
+                flash(f'A document with URL {url} already exists. No new document created.', 'info')
+                return redirect(prefixed_url_for('document_bp.documents'))
+
+            # Download the content
+            response = requests.get(url)
+            response.raise_for_status()
+            file_content = response.content
+
+            # Create document and document version
+            form_dict = form_to_dict(form)
+            new_doc, new_doc_vers = create_document_stack(form_dict, file_content, filename, extension)
+
+            # Upload file to storage
+            minio_client.upload_document_file(
+                session['tenant']['id'],
+                new_doc_vers.doc_id,
+                new_doc_vers.language,
+                new_doc_vers.id,
+                filename,
+                file_content
+            )
+
+            # Start embedding task
+            task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
+                session['tenant']['id'],
+                new_doc_vers.id,
+            ])
+
+            current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
+                                    f'Document Version {new_doc_vers.id}. '
+                                    f'Embedding creation task: {task.id}')
+            flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
+                  'success')
+
            return redirect(prefixed_url_for('document_bp.documents'))
-        # Only when no document with URL exists
-        html = fetch_html(url)
-        file = io.BytesIO(html)

-        parsed_url = urlparse(url)
-        path_parts = parsed_url.path.split('/')
-        filename = path_parts[-1]
-        if filename == '':
-            filename = 'index'
-        if not filename.endswith('.html'):
-            filename += '.html'
-        extension = 'html'
-        form_dict = form_to_dict(form)
-
-        new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
-
-        task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
-            session['tenant']['id'],
-            new_doc_vers.id,
-        ])
-        current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
-                                f'Document Version {new_doc_vers.id}. '
-                                f'Embedding creation task: {task.id}')
-        flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
-              'success')
-
-        return redirect(prefixed_url_for('document_bp.documents'))
-    else:
-        form_validation_failed(request, form)
+        except requests.RequestException as e:
+            current_app.logger.error(f'Error fetching URL {url}: {str(e)}')
+            flash(f'Error fetching URL: {str(e)}', 'danger')
+        except SQLAlchemyError as e:
+            current_app.logger.error(f'Database error: {str(e)}')
+            flash('An error occurred while saving the document.', 'danger')
+        except Exception as e:
+            current_app.logger.error(f'Unexpected error: {str(e)}')
+            flash('An unexpected error occurred.', 'danger')

    return render_template('document/add_url.html', form=form)


+def get_extension_from_content_type(content_type):
+    content_type_map = {
+        'text/html': 'html',
+        'application/pdf': 'pdf',
+        'text/plain': 'txt',
+        'application/msword': 'doc',
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+        # Add more mappings as needed
+    }
+    return content_type_map.get(content_type, 'html')  # Default to 'html' if unknown
+
+
@document_bp.route('/add_urls', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
 def add_urls():
@@ -358,6 +402,8 @@ def handle_document_version_selection():

    action = request.form['action']

+    current_app.logger.debug(f'Triggered Document Version Action: {action}')
+
    match action:
        case 'edit_document_version':
            return redirect(prefixed_url_for('document_bp.edit_document_version', document_version_id=doc_vers_id))