Improve HTML Processing + Introduction of Processed File viewer

This commit is contained in:
Josako
2025-05-19 17:18:16 +02:00
parent d2bb51a4a8
commit 70de4c0328
8 changed files with 222 additions and 6 deletions

View File

@@ -68,7 +68,7 @@ class LicensePeriodServices:
# Status is PENDING, so no prepaid payment received. There is no license period we can use. # Status is PENDING, so no prepaid payment received. There is no license period we can use.
# We allow for a delay of 5 days before raising an exception. # We allow for a delay of 5 days before raising an exception.
current_date = dt.now(tz.utc).date() current_date = dt.now(tz.utc).date()
delta = abs(current_date - license_period.period_start_date) delta = abs(current_date - license_period.period_start)
if delta > timedelta(days=current_app.config.get('ENTITLEMENTS_MAX_PENDING_DAYS', 5)): if delta > timedelta(days=current_app.config.get('ENTITLEMENTS_MAX_PENDING_DAYS', 5)):
raise EveAIPendingLicensePeriod() raise EveAIPendingLicensePeriod()
case PeriodStatus.ACTIVE: case PeriodStatus.ACTIVE:

View File

@@ -1,6 +1,8 @@
# common/utils/filters.py # common/utils/filters.py
import pytz import pytz
import markdown
from markupsafe import Markup
from datetime import datetime from datetime import datetime
from common.utils.nginx_utils import prefixed_url_for as puf from common.utils.nginx_utils import prefixed_url_for as puf
@@ -43,6 +45,44 @@ def status_color(status_name):
return colors.get(status_name, 'secondary') return colors.get(status_name, 'secondary')
def render_markdown(text):
    """
    Jinja filter: convert a markdown string to HTML.

    Falsy input (None or an empty string) yields an empty string. Otherwise
    the text is first passed through clean_markdown() and then rendered with
    the fenced_code, codehilite, tables and toc extensions enabled.

    The result is wrapped in Markup, so Jinja will not escape it again.
    """
    if not text:
        return ""

    # Drop a surrounding triple-backtick fence (and "markdown" label) first.
    unwrapped = clean_markdown(text)

    enabled_extensions = [
        'markdown.extensions.fenced_code',
        'markdown.extensions.codehilite',
        'markdown.extensions.tables',
        'markdown.extensions.toc'
    ]
    html = markdown.markdown(unwrapped, extensions=enabled_extensions)

    # NOTE(review): Markup() marks the HTML as safe for templates; presumably
    # any raw HTML present in the markdown is passed through unescaped --
    # confirm that the markdown fed to this filter is trusted.
    return Markup(html)
def clean_markdown(text):
    """
    Strip a wrapping triple-backtick fence (and "markdown" label) from text.

    Falsy input returns an empty string. Surrounding whitespace is trimmed
    before and after each fence is removed.
    """
    if not text:
        return ""

    stripped = text.strip()

    # Check the labelled fence first so the "markdown" tag is removed as well;
    # only one opening fence is ever removed.
    for opening in ("```markdown", "```"):
        if stripped.startswith(opening):
            stripped = stripped[len(opening):].strip()
            break

    if stripped.endswith("```"):
        stripped = stripped[:-3].strip()

    return stripped
def prefixed_url_for(endpoint): def prefixed_url_for(endpoint):
return puf(endpoint) return puf(endpoint)
@@ -55,5 +95,7 @@ def register_filters(app):
app.jinja_env.filters['time_difference'] = time_difference app.jinja_env.filters['time_difference'] = time_difference
app.jinja_env.filters['status_color'] = status_color app.jinja_env.filters['status_color'] = status_color
app.jinja_env.filters['prefixed_url_for'] = prefixed_url_for app.jinja_env.filters['prefixed_url_for'] = prefixed_url_for
app.jinja_env.filters['markdown'] = render_markdown
app.jinja_env.filters['clean_markdown'] = clean_markdown
app.jinja_env.globals['prefixed_url_for'] = prefixed_url_for app.jinja_env.globals['prefixed_url_for'] = prefixed_url_for

View File

@@ -13,6 +13,7 @@
{{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }} {{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
<div class="form-group mt-3"> <div class="form-group mt-3">
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary">Edit Document Version</button> <button type="submit" name="action" value="edit_document_version" class="btn btn-primary">Edit Document Version</button>
<button type="submit" name="action" value="view_document_version_markdown" class="btn btn-danger">View Processed Document</button>
<button type="submit" name="action" value="process_document_version" class="btn btn-danger">Process Document Version</button> <button type="submit" name="action" value="process_document_version" class="btn btn-danger">Process Document Version</button>
</div> </div>
</form> </form>

View File

@@ -37,6 +37,7 @@
<div class="form-group mt-3 d-flex justify-content-between"> <div class="form-group mt-3 d-flex justify-content-between">
<div> <div>
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button> <button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button>
<button type="submit" name="action" value="view_document_version_markdown" class="btn btn-danger">View Processed Document</button>
<button type="submit" name="action" value="process_document_version" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">Process Document Version</button> <button type="submit" name="action" value="process_document_version" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">Process Document Version</button>
</div> </div>
</div> </div>

View File

@@ -0,0 +1,115 @@
{% extends "base.html" %}
{% block title %}Document Version Markdown{% endblock %}
{% block content_title %}Document Version Markdown{% endblock %}
{% block content_description %}Markdown inhoud van document versie {{ document_version.id }}.{% endblock %}
{% block content %}
<div class="container mt-5">
<div class="card">
<div class="card-header bg-light">
<h5>Document Informatie</h5>
</div>
<div class="card-body">
<p><strong>Document ID:</strong> {{ document_version.doc_id }}</p>
<p><strong>Versie ID:</strong> {{ document_version.id }}</p>
<p><strong>Object Naam:</strong> {{ document_version.object_name }}</p>
<p><strong>Taal:</strong> {{ document_version.language }}</p>
</div>
</div>
<div class="card mt-4">
<div class="card-header bg-light d-flex justify-content-between align-items-center">
<h5>Markdown Content</h5>
<div class="btn-group" role="group">
<button class="btn btn-sm btn-outline-secondary" id="showRaw">Toon ruw</button>
<button class="btn btn-sm btn-outline-primary active" id="showRendered">Toon gerenderd</button>
</div>
</div>
<div class="card-body">
<!-- Ruwe markdown weergave (standaard verborgen) -->
<div id="rawMarkdown" class="code-wrapper" style="display: none;">
<pre><code class="language-markdown">{{ markdown_content }}</code></pre>
</div>
<!-- Gerenderde markdown weergave -->
<div id="renderedMarkdown" class="markdown-body">
{{ markdown_content | markdown }}
</div>
</div>
</div>
</div>
{% endblock %}
{% block styles %}
{{ super() }}
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/github-markdown-css@4.0.0/github-markdown.min.css">
<style>
pre, code {
margin: 0;
padding: 0;
white-space: pre-wrap !important;
word-wrap: break-word !important;
max-width: 100%;
}
pre code {
padding: 1rem !important;
border-radius: 4px;
font-size: 0.75rem;
line-height: 1.5;
white-space: pre-wrap !important;
}
.code-wrapper {
position: relative;
width: 100%;
}
.markdown-body {
padding: 1rem;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
line-height: 1.6;
}
/* Zorg ervoor dat de markdown goed uitziet in de donkere modus van de app (optioneel) */
@media (prefers-color-scheme: dark) {
.markdown-body {
color: #c9d1d9;
background-color: #0d1117;
}
}
</style>
{% endblock %}
{% block scripts %}
{{ super() }}
<script>
document.addEventListener('DOMContentLoaded', function() {
    // Syntax highlighting is optional. This template does not load
    // highlight.js itself (presumably the base template does -- confirm), so
    // guard the call: an undefined `hljs` must not prevent the view toggle
    // below from being wired up.
    if (typeof hljs !== 'undefined') {
        document.querySelectorAll('pre code').forEach((block) => {
            hljs.highlightElement(block);
        });
    }

    // Toggle buttons and the two views they switch between.
    const showRawBtn = document.getElementById('showRaw');
    const showRenderedBtn = document.getElementById('showRendered');
    const rawMarkdown = document.getElementById('rawMarkdown');
    const renderedMarkdown = document.getElementById('renderedMarkdown');

    // Show exactly one of the two views and mark the matching button active.
    const setView = (showRaw) => {
        rawMarkdown.style.display = showRaw ? 'block' : 'none';
        renderedMarkdown.style.display = showRaw ? 'none' : 'block';
        showRawBtn.classList.toggle('active', showRaw);
        showRenderedBtn.classList.toggle('active', !showRaw);
    };

    showRawBtn.addEventListener('click', () => setView(true));
    showRenderedBtn.addEventListener('click', () => setView(false));
});
</script>
{% endblock %}

View File

@@ -12,7 +12,7 @@ from requests.exceptions import SSLError
import json import json
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
from common.extensions import db, cache_manager from common.extensions import db, cache_manager, minio_client
from common.models.interaction import Specialist, SpecialistRetriever from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \ from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \ edit_document, \
@@ -215,7 +215,7 @@ def edit_processor(processor_id):
try: try:
db.session.add(processor) db.session.add(processor)
db.session.commit() db.session.commit()
flash('Retriever updated successfully!', 'success') flash('Processor updated successfully!', 'success')
current_app.logger.info(f'Processor {processor.id} updated successfully') current_app.logger.info(f'Processor {processor.id} updated successfully')
except SQLAlchemyError as e: except SQLAlchemyError as e:
db.session.rollback() db.session.rollback()
@@ -649,6 +649,7 @@ def handle_document_version_selection():
return redirect(prefixed_url_for('document_bp.document_versions_list')) return redirect(prefixed_url_for('document_bp.document_versions_list'))
action = request.form['action'] action = request.form['action']
current_app.logger.debug(f'Action: {action}')
match action: match action:
case 'edit_document_version': case 'edit_document_version':
@@ -656,6 +657,9 @@ def handle_document_version_selection():
case 'process_document_version': case 'process_document_version':
process_version(doc_vers_id) process_version(doc_vers_id)
# Add more conditions for other actions # Add more conditions for other actions
case 'view_document_version_markdown':
return redirect(prefixed_url_for('document_bp.view_document_version_markdown',
document_version_id=doc_vers_id))
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id) doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id)) return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id))
@@ -772,6 +776,45 @@ def document_versions_list():
return view.get() return view.get()
@document_bp.route('/view_document_version_markdown/<int:document_version_id>', methods=['GET'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def view_document_version_markdown(document_version_id):
    """
    Show the processed markdown file stored for a document version.

    The markdown object name is derived from the document version
    (doc_id, language, id and "<id>.md"), downloaded through minio_client,
    decoded as UTF-8 and rendered with the
    'document/view_document_version_markdown.html' template.

    :param document_version_id: primary key of the DocumentVersion to show.
    :return: the rendered template, or a redirect to the versions overview of
        the owning document (with a flashed error) when anything in the
        retrieval or rendering fails. A missing document version yields 404.
    """
    current_app.logger.debug(f'Viewing document version markdown {document_version_id}')

    # Retrieve document version (aborts with 404 when it does not exist)
    document_version = DocumentVersion.query.get_or_404(document_version_id)

    # Retrieve tenant information
    tenant_id = session.get('tenant').get('id')

    try:
        # Generate markdown filename
        markdown_filename = f"{document_version.id}.md"
        markdown_object_name = minio_client.generate_object_name(document_version.doc_id, document_version.language,
                                                                 document_version.id, markdown_filename)
        current_app.logger.debug(f'Markdown object name: {markdown_object_name}')

        # Download actual markdown file
        file_data = minio_client.download_document_file(
            tenant_id,
            document_version.bucket_name,
            markdown_object_name,
        )

        # Decode the binary data to UTF-8 text
        markdown_content = file_data.decode('utf-8')
        # Log only the size: dumping a whole document into the debug log is
        # noisy and may expose document contents.
        current_app.logger.debug(f'Markdown content length: {len(markdown_content)} characters')

        # Render the template with the markdown content
        return render_template(
            'document/view_document_version_markdown.html',
            document_version=document_version,
            markdown_content=markdown_content
        )
    except Exception as e:
        current_app.logger.error(f"Error retrieving markdown for document version {document_version_id}: {str(e)}")
        flash(f"Error retrieving processed document: {str(e)}", "danger")
        # The document_versions endpoint is addressed with a document_id
        # elsewhere in this module; pass it here too so the fallback redirect
        # can actually be built.
        return redirect(prefixed_url_for('document_bp.document_versions', document_id=document_version.doc_id))
def refresh_all_documents(): def refresh_all_documents():
for doc in Document.query.all(): for doc in Document.query.all():
refresh_document(doc.id) refresh_document(doc.id)
@@ -842,3 +885,16 @@ def fetch_html(url):
response.raise_for_status() # Will raise an exception for bad requests response.raise_for_status() # Will raise an exception for bad requests
return response.content return response.content
def clean_markdown(markdown):
    """
    Remove a wrapping triple-backtick fence from a markdown string.

    A leading "```markdown" (or bare "```") and a trailing "```" are removed,
    trimming surrounding whitespace at each step.

    :param markdown: the markdown text; None or an empty string is accepted.
    :return: the unwrapped text, or "" when the input is None or empty.
    """
    # Guard against None/empty input instead of failing on .strip().
    if not markdown:
        return ""

    markdown = markdown.strip()
    if markdown.startswith("```markdown"):
        markdown = markdown[len("```markdown"):].strip()
    elif markdown.startswith("```"):
        markdown = markdown[3:].strip()
    if markdown.endswith("```"):
        markdown = markdown[:-3].strip()
    return markdown

View File

@@ -106,7 +106,7 @@ class HTMLProcessor(BaseProcessor):
current_chunk = [] current_chunk = []
current_size = 0 current_size = 0
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']): for element in soup.find_all(self.html_tags):
element_html = str(element) element_html = str(element)
element_size = len(element_html) element_size = len(element_html)

View File

@@ -92,3 +92,4 @@ pandas~=2.2.3
prometheus_client~=0.21.1 prometheus_client~=0.21.1
scaleway~=2.9.0 scaleway~=2.9.0
html2text~=2025.4.15 html2text~=2025.4.15
markdown~=3.8