Improve HTML Processing + Introduction of Processed File viewer

This commit is contained in:
Josako
2025-05-19 17:18:16 +02:00
parent d2bb51a4a8
commit 70de4c0328
8 changed files with 222 additions and 6 deletions

View File

@@ -12,7 +12,7 @@ from requests.exceptions import SSLError
import json
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
from common.extensions import db, cache_manager
from common.extensions import db, cache_manager, minio_client
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
@@ -215,7 +215,7 @@ def edit_processor(processor_id):
try:
db.session.add(processor)
db.session.commit()
flash('Retriever updated successfully!', 'success')
flash('Processor updated successfully!', 'success')
current_app.logger.info(f'Processor {processor.id} updated successfully')
except SQLAlchemyError as e:
db.session.rollback()
@@ -649,6 +649,7 @@ def handle_document_version_selection():
return redirect(prefixed_url_for('document_bp.document_versions_list'))
action = request.form['action']
current_app.logger.debug(f'Action: {action}')
match action:
case 'edit_document_version':
@@ -656,6 +657,9 @@ def handle_document_version_selection():
case 'process_document_version':
process_version(doc_vers_id)
# Add more conditions for other actions
case 'view_document_version_markdown':
return redirect(prefixed_url_for('document_bp.view_document_version_markdown',
document_version_id=doc_vers_id))
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id))
@@ -772,6 +776,45 @@ def document_versions_list():
return view.get()
@document_bp.route('/view_document_version_markdown/<int:document_version_id>', methods=['GET'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def view_document_version_markdown(document_version_id):
    """Show the processed markdown file that belongs to a document version.

    The markdown object is looked up in object storage under the name
    ``<document_version.id>.md``, decoded as UTF-8 and handed to the
    ``document/view_document_version_markdown.html`` template.

    Args:
        document_version_id: Primary key of the DocumentVersion to display.

    Returns:
        The rendered template on success. When the file cannot be downloaded
        or decoded, the error is flashed and the user is redirected to the
        version overview of the owning document.
    """
    current_app.logger.debug(f'Viewing document version markdown {document_version_id}')

    # Retrieve document version (get_or_404 aborts with a 404 when it is unknown)
    document_version = DocumentVersion.query.get_or_404(document_version_id)

    # Retrieve tenant information
    tenant_id = session.get('tenant').get('id')

    try:
        # Generate markdown filename
        markdown_filename = f"{document_version.id}.md"
        markdown_object_name = minio_client.generate_object_name(document_version.doc_id, document_version.language,
                                                                 document_version.id, markdown_filename)
        current_app.logger.debug(f'Markdown object name: {markdown_object_name}')

        # Download actual markdown file
        file_data = minio_client.download_document_file(
            tenant_id,
            document_version.bucket_name,
            markdown_object_name,
        )

        # Decode the binary data to UTF-8 text
        markdown_content = file_data.decode('utf-8')
        # Log only the size: dumping the complete document would flood the log
        # and copy tenant content into it.
        current_app.logger.debug(f'Markdown content retrieved ({len(markdown_content)} characters)')

        # Render the template with the markdown content
        return render_template(
            'document/view_document_version_markdown.html',
            document_version=document_version,
            markdown_content=markdown_content
        )
    except Exception as e:
        current_app.logger.error(f"Error retrieving markdown for document version {document_version_id}: {str(e)}")
        flash(f"Error retrieving processed document: {str(e)}", "danger")
        # The document_versions endpoint is addressed with a document_id elsewhere
        # in this module; without it the URL cannot be built and the error handler
        # itself would fail.
        return redirect(prefixed_url_for('document_bp.document_versions',
                                         document_id=document_version.doc_id))
def refresh_all_documents():
for doc in Document.query.all():
refresh_document(doc.id)
@@ -842,3 +885,16 @@ def fetch_html(url):
response.raise_for_status() # Will raise an exception for bad requests
return response.content
def clean_markdown(markdown):
    """Strip a wrapping triple-backtick code fence from a markdown string.

    Surrounding whitespace is trimmed first. A leading ```markdown or plain
    ``` marker is then removed (only one of the two), followed by a trailing
    ``` marker when present. The remaining text is returned trimmed.
    """
    text = markdown.strip()

    # Try the longer opening marker first; at most one opening marker is removed.
    for opening_fence in ("```markdown", "```"):
        if text.startswith(opening_fence):
            text = text[len(opening_fence):].strip()
            break

    closing_fence = "```"
    if text.endswith(closing_fence):
        text = text[:-len(closing_fence)].strip()

    return text