From 70de4c032869bfd53e2700138713247c2128924a Mon Sep 17 00:00:00 2001 From: Josako Date: Mon, 19 May 2025 17:18:16 +0200 Subject: [PATCH] Improve HTML Processing + Introduction of Processed File viewer --- .../entitlements/license_period_services.py | 2 +- common/utils/template_filters.py | 42 +++++++ .../templates/document/document_versions.html | 3 +- .../document/document_versions_list_view.html | 1 + .../view_document_version_markdown.html | 115 ++++++++++++++++++ eveai_app/views/document_views.py | 60 ++++++++- eveai_workers/processors/html_processor.py | 2 +- requirements.txt | 3 +- 8 files changed, 222 insertions(+), 6 deletions(-) create mode 100644 eveai_app/templates/document/view_document_version_markdown.html diff --git a/common/services/entitlements/license_period_services.py b/common/services/entitlements/license_period_services.py index b87596e..569fde1 100644 --- a/common/services/entitlements/license_period_services.py +++ b/common/services/entitlements/license_period_services.py @@ -68,7 +68,7 @@ class LicensePeriodServices: # Status is PENDING, so no prepaid payment received. There is no license period we can use. # We allow for a delay of 5 days before raising an exception. 
current_date = dt.now(tz.utc).date() - delta = abs(current_date - license_period.period_start_date) + delta = abs(current_date - license_period.period_start) if delta > timedelta(days=current_app.config.get('ENTITLEMENTS_MAX_PENDING_DAYS', 5)): raise EveAIPendingLicensePeriod() case PeriodStatus.ACTIVE: diff --git a/common/utils/template_filters.py b/common/utils/template_filters.py index d42b47b..4527ef1 100644 --- a/common/utils/template_filters.py +++ b/common/utils/template_filters.py @@ -1,6 +1,8 @@ # common/utils/filters.py import pytz +import markdown +from markupsafe import Markup from datetime import datetime from common.utils.nginx_utils import prefixed_url_for as puf @@ -43,6 +45,44 @@ def status_color(status_name): return colors.get(status_name, 'secondary') +def render_markdown(text): + """ + Render markdown text to HTML (fenced_code, codehilite, tables and toc extensions); returns "" for empty input. + NOTE(review): raw HTML in the input is not sanitized before being wrapped in Markup - only use on trusted content. + """ + if not text: + return "" + + # Strip a wrapping triple-backtick fence and its "markdown" label + text = clean_markdown(text) + + # Render the markdown with extensions + return Markup(markdown.markdown(text, extensions=[ + 'markdown.extensions.fenced_code', + 'markdown.extensions.codehilite', + 'markdown.extensions.tables', + 'markdown.extensions.toc' + ])) + + +def clean_markdown(text): + """ + Strip a wrapping triple-backtick fence (optionally labelled "markdown") from the text; returns "" for empty input. + """ + if not text: + return "" + + text = text.strip() + for fence in ("```markdown", "```"): + if text.startswith(fence): + # A closing fence is only dropped together with an opening one, so a document ending in a code block stays intact + text = text[len(fence):].strip() + text = text.removesuffix("```").strip() + break + + return text + + def prefixed_url_for(endpoint): return puf(endpoint) @@ -55,5 +95,7 @@ def register_filters(app): app.jinja_env.filters['time_difference'] = time_difference app.jinja_env.filters['status_color'] = status_color app.jinja_env.filters['prefixed_url_for'] = prefixed_url_for + app.jinja_env.filters['markdown'] = render_markdown + 
app.jinja_env.filters['clean_markdown'] = clean_markdown app.jinja_env.globals['prefixed_url_for'] = prefixed_url_for diff --git a/eveai_app/templates/document/document_versions.html b/eveai_app/templates/document/document_versions.html index b32c6f9..7c6e80a 100644 --- a/eveai_app/templates/document/document_versions.html +++ b/eveai_app/templates/document/document_versions.html @@ -13,6 +13,7 @@ {{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
+
@@ -21,4 +22,4 @@ {% block content_footer %} {{ render_pagination(pagination, 'document_bp.documents') }} -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/eveai_app/templates/document/document_versions_list_view.html b/eveai_app/templates/document/document_versions_list_view.html index 7359736..2a759be 100644 --- a/eveai_app/templates/document/document_versions_list_view.html +++ b/eveai_app/templates/document/document_versions_list_view.html @@ -37,6 +37,7 @@
+
diff --git a/eveai_app/templates/document/view_document_version_markdown.html b/eveai_app/templates/document/view_document_version_markdown.html new file mode 100644 index 0000000..2e1def3 --- /dev/null +++ b/eveai_app/templates/document/view_document_version_markdown.html @@ -0,0 +1,115 @@ +{% extends "base.html" %} +{% block title %}Document Version Markdown{% endblock %} + +{% block content_title %}Document Version Markdown{% endblock %} +{% block content_description %}Markdown inhoud van document versie {{ document_version.id }}.{% endblock %} + +{% block content %} +
+
+
+
Document Informatie
+
+
+

Document ID: {{ document_version.doc_id }}

+

Versie ID: {{ document_version.id }}

+

Object Naam: {{ document_version.object_name }}

+

Taal: {{ document_version.language }}

+
+
+ +
+
+
Markdown Content
+
+ + +
+
+
+ + + + +
+ {{ markdown_content | markdown }} +
+
+
+
+{% endblock %} + +{% block styles %} +{{ super() }} + + +{% endblock %} + +{% block scripts %} +{{ super() }} + +{% endblock %} \ No newline at end of file diff --git a/eveai_app/views/document_views.py b/eveai_app/views/document_views.py index 2f8fcf3..ebb21fb 100644 --- a/eveai_app/views/document_views.py +++ b/eveai_app/views/document_views.py @@ -12,7 +12,7 @@ from requests.exceptions import SSLError import json from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor -from common.extensions import db, cache_manager +from common.extensions import db, cache_manager, minio_client from common.models.interaction import Specialist, SpecialistRetriever from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \ edit_document, \ @@ -215,7 +215,7 @@ def edit_processor(processor_id): try: db.session.add(processor) db.session.commit() - flash('Retriever updated successfully!', 'success') + flash('Processor updated successfully!', 'success') current_app.logger.info(f'Processor {processor.id} updated successfully') except SQLAlchemyError as e: db.session.rollback() @@ -649,6 +649,7 @@ def handle_document_version_selection(): return redirect(prefixed_url_for('document_bp.document_versions_list')) action = request.form['action'] + current_app.logger.debug(f'Action: {action}') match action: case 'edit_document_version': @@ -656,6 +657,9 @@ def handle_document_version_selection(): case 'process_document_version': process_version(doc_vers_id) # Add more conditions for other actions + case 'view_document_version_markdown': + return redirect(prefixed_url_for('document_bp.view_document_version_markdown', + document_version_id=doc_vers_id)) doc_vers = DocumentVersion.query.get_or_404(doc_vers_id) return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id)) @@ -772,6 +776,45 @@ def document_versions_list(): return view.get() 
+@document_bp.route('/view_document_version_markdown/<int:document_version_id>', methods=['GET']) +@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin') +def view_document_version_markdown(document_version_id): + """Render the processed markdown file of a document version; responds 404 when the version does not exist.""" + current_app.logger.debug(f'Viewing document version markdown {document_version_id}') + document_version = DocumentVersion.query.get_or_404(document_version_id) + + # Retrieve tenant information + tenant_id = session.get('tenant').get('id') + + try: + # Generate markdown filename + markdown_filename = f"{document_version.id}.md" + markdown_object_name = minio_client.generate_object_name(document_version.doc_id, document_version.language, + document_version.id, markdown_filename) + current_app.logger.debug(f'Markdown object name: {markdown_object_name}') + # Download actual markdown file + file_data = minio_client.download_document_file( + tenant_id, + document_version.bucket_name, + markdown_object_name, + ) + + # Decode the binary data to UTF-8 text + markdown_content = file_data.decode('utf-8') + current_app.logger.debug(f'Markdown content length: {len(markdown_content)}') + + # Render the template with the markdown content + return render_template( + 'document/view_document_version_markdown.html', + document_version=document_version, + markdown_content=markdown_content + ) + except Exception as e: + current_app.logger.error(f"Error retrieving markdown for document version {document_version_id}: {str(e)}") + flash(f"Error retrieving processed document: {str(e)}", "danger") + return redirect(prefixed_url_for('document_bp.document_versions', document_id=document_version.doc_id)) + + def refresh_all_documents(): for doc in Document.query.all(): refresh_document(doc.id) @@ -842,3 +885,16 @@ def fetch_html(url): response.raise_for_status() # Will raise an exception for bad requests return response.content + + +def clean_markdown(markdown): + """Strip a wrapping triple-backtick fence (optionally labelled "markdown") from the markdown text.""" + markdown = markdown.strip() + for fence in ("```markdown", "```"): + if markdown.startswith(fence): + # A closing fence is only dropped together with an opening one + markdown = markdown[len(fence):].strip() + markdown = markdown.removesuffix("```").strip() + break + return markdown + 
diff --git a/eveai_workers/processors/html_processor.py b/eveai_workers/processors/html_processor.py index 711cca6..6e14c3b 100644 --- a/eveai_workers/processors/html_processor.py +++ b/eveai_workers/processors/html_processor.py @@ -106,7 +106,7 @@ class HTMLProcessor(BaseProcessor): current_chunk = [] current_size = 0 - for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']): + for element in soup.find_all(self.html_tags): element_html = str(element) element_size = len(element_html) diff --git a/requirements.txt b/requirements.txt index 0452266..6d3c506 100644 --- a/requirements.txt +++ b/requirements.txt @@ -91,4 +91,5 @@ contextvars~=2.4 pandas~=2.2.3 prometheus_client~=0.21.1 scaleway~=2.9.0 -html2text~=2025.4.15 \ No newline at end of file +html2text~=2025.4.15 +markdown~=3.8