- Allow for more complex and longer PDFs to be uploaded to Evie. First implmentation of a processor for specific file types.
- Allow URLs to contain other information than just HTML information. It can alose refer to e.g. PDF-files.
This commit is contained in:
@@ -12,7 +12,7 @@ from werkzeug.utils import secure_filename
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
import requests
|
||||
from requests.exceptions import SSLError
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import urlparse, unquote
|
||||
import io
|
||||
from minio.error import S3Error
|
||||
|
||||
@@ -89,49 +89,93 @@ def add_document():
|
||||
def add_url():
|
||||
form = AddURLForm()
|
||||
|
||||
# If the form is submitted
|
||||
if form.validate_on_submit():
|
||||
current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
|
||||
url = form.url.data
|
||||
|
||||
doc_vers = DocumentVersion.query.filter_by(url=url).all()
|
||||
if doc_vers:
|
||||
current_app.logger.info(f'A document with url {url} already exists. No new document created.')
|
||||
flash(f'A document with url {url} already exists. No new document created.', 'info')
|
||||
try:
|
||||
response = requests.head(url, allow_redirects=True)
|
||||
content_type = response.headers.get('Content-Type', '').split(';')[0]
|
||||
|
||||
# Determine file extension based on Content-Type
|
||||
extension = get_extension_from_content_type(content_type)
|
||||
|
||||
# Generate filename
|
||||
parsed_url = urlparse(url)
|
||||
path = unquote(parsed_url.path)
|
||||
filename = os.path.basename(path)
|
||||
|
||||
if not filename or '.' not in filename:
|
||||
# Use the last part of the path or a default name
|
||||
filename = path.strip('/').split('/')[-1] or 'document'
|
||||
filename = secure_filename(f"{filename}.{extension}")
|
||||
else:
|
||||
filename = secure_filename(filename)
|
||||
|
||||
# Check if a document with this URL already exists
|
||||
existing_doc = DocumentVersion.query.filter_by(url=url).first()
|
||||
if existing_doc:
|
||||
flash(f'A document with URL {url} already exists. No new document created.', 'info')
|
||||
return redirect(prefixed_url_for('document_bp.documents'))
|
||||
|
||||
# Download the content
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
file_content = response.content
|
||||
|
||||
# Create document and document version
|
||||
form_dict = form_to_dict(form)
|
||||
new_doc, new_doc_vers = create_document_stack(form_dict, file_content, filename, extension)
|
||||
|
||||
# Upload file to storage
|
||||
minio_client.upload_document_file(
|
||||
session['tenant']['id'],
|
||||
new_doc_vers.doc_id,
|
||||
new_doc_vers.language,
|
||||
new_doc_vers.id,
|
||||
filename,
|
||||
file_content
|
||||
)
|
||||
|
||||
# Start embedding task
|
||||
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
|
||||
session['tenant']['id'],
|
||||
new_doc_vers.id,
|
||||
])
|
||||
|
||||
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
|
||||
f'Document Version {new_doc_vers.id}. '
|
||||
f'Embedding creation task: {task.id}')
|
||||
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
|
||||
'success')
|
||||
|
||||
return redirect(prefixed_url_for('document_bp.documents'))
|
||||
# Only when no document with URL exists
|
||||
html = fetch_html(url)
|
||||
file = io.BytesIO(html)
|
||||
|
||||
parsed_url = urlparse(url)
|
||||
path_parts = parsed_url.path.split('/')
|
||||
filename = path_parts[-1]
|
||||
if filename == '':
|
||||
filename = 'index'
|
||||
if not filename.endswith('.html'):
|
||||
filename += '.html'
|
||||
extension = 'html'
|
||||
form_dict = form_to_dict(form)
|
||||
|
||||
new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension)
|
||||
|
||||
task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
|
||||
session['tenant']['id'],
|
||||
new_doc_vers.id,
|
||||
])
|
||||
current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
|
||||
f'Document Version {new_doc_vers.id}. '
|
||||
f'Embedding creation task: {task.id}')
|
||||
flash(f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
|
||||
'success')
|
||||
|
||||
return redirect(prefixed_url_for('document_bp.documents'))
|
||||
else:
|
||||
form_validation_failed(request, form)
|
||||
except requests.RequestException as e:
|
||||
current_app.logger.error(f'Error fetching URL {url}: {str(e)}')
|
||||
flash(f'Error fetching URL: {str(e)}', 'danger')
|
||||
except SQLAlchemyError as e:
|
||||
current_app.logger.error(f'Database error: {str(e)}')
|
||||
flash('An error occurred while saving the document.', 'danger')
|
||||
except Exception as e:
|
||||
current_app.logger.error(f'Unexpected error: {str(e)}')
|
||||
flash('An unexpected error occurred.', 'danger')
|
||||
|
||||
return render_template('document/add_url.html', form=form)
|
||||
|
||||
|
||||
def get_extension_from_content_type(content_type):
|
||||
content_type_map = {
|
||||
'text/html': 'html',
|
||||
'application/pdf': 'pdf',
|
||||
'text/plain': 'txt',
|
||||
'application/msword': 'doc',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
|
||||
# Add more mappings as needed
|
||||
}
|
||||
return content_type_map.get(content_type, 'html') # Default to 'html' if unknown
|
||||
|
||||
|
||||
@document_bp.route('/add_urls', methods=['GET', 'POST'])
|
||||
@roles_accepted('Super User', 'Tenant Admin')
|
||||
def add_urls():
|
||||
@@ -358,6 +402,8 @@ def handle_document_version_selection():
|
||||
|
||||
action = request.form['action']
|
||||
|
||||
current_app.logger.debug(f'Triggered Document Version Action: {action}')
|
||||
|
||||
match action:
|
||||
case 'edit_document_version':
|
||||
return redirect(prefixed_url_for('document_bp.edit_document_version', document_version_id=doc_vers_id))
|
||||
|
||||
Reference in New Issue
Block a user