Improve HTML Processing + Introduction of Processed File viewer
This commit is contained in:
@@ -68,7 +68,7 @@ class LicensePeriodServices:
|
||||
# Status is PENDING, so no prepaid payment received. There is no license period we can use.
|
||||
# We allow for a delay of 5 days before raising an exception.
|
||||
current_date = dt.now(tz.utc).date()
|
||||
delta = abs(current_date - license_period.period_start_date)
|
||||
delta = abs(current_date - license_period.period_start)
|
||||
if delta > timedelta(days=current_app.config.get('ENTITLEMENTS_MAX_PENDING_DAYS', 5)):
|
||||
raise EveAIPendingLicensePeriod()
|
||||
case PeriodStatus.ACTIVE:
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# common/utils/filters.py
|
||||
|
||||
import pytz
|
||||
import markdown
|
||||
from markupsafe import Markup
|
||||
from datetime import datetime
|
||||
from common.utils.nginx_utils import prefixed_url_for as puf
|
||||
|
||||
@@ -43,6 +45,44 @@ def status_color(status_name):
|
||||
return colors.get(status_name, 'secondary')
|
||||
|
||||
|
||||
def render_markdown(text):
    """
    Convert markdown text to HTML using the Python ``markdown`` library.

    Falsy input (``None`` or an empty string) yields an empty string.
    Otherwise the text is first passed through ``clean_markdown`` and then
    rendered with the fenced_code, codehilite, tables and toc extensions.
    The resulting HTML is returned wrapped in ``Markup``.
    """
    if not text:
        return ""

    # Strip the wrapping triple backticks and the "markdown" label, if present.
    stripped = clean_markdown(text)

    enabled_extensions = [
        'markdown.extensions.fenced_code',
        'markdown.extensions.codehilite',
        'markdown.extensions.tables',
        'markdown.extensions.toc'
    ]

    # NOTE(review): the rendered HTML is marked as safe without sanitization;
    # confirm the markdown always comes from a trusted source.
    html = markdown.markdown(stripped, extensions=enabled_extensions)
    return Markup(html)
|
||||
|
||||
|
||||
def clean_markdown(text):
    """
    Remove a wrapping triple-backtick fence (and "markdown" label) from text.

    Falsy input yields an empty string. The text is stripped of surrounding
    whitespace; a leading "```markdown" or "```" and a trailing "```" are
    removed, with the remainder stripped again after each removal.
    """
    if not text:
        return ""

    cleaned = text.strip()

    # Check the longer opener first so the "markdown" label is removed too.
    for opener in ("```markdown", "```"):
        if cleaned.startswith(opener):
            cleaned = cleaned[len(opener):].strip()
            break

    closer = "```"
    if cleaned.endswith(closer):
        cleaned = cleaned[:-len(closer)].strip()

    return cleaned
|
||||
|
||||
|
||||
def prefixed_url_for(endpoint):
    """Return the URL for *endpoint* as built by the nginx_utils helper ``puf``."""
    resolved_url = puf(endpoint)
    return resolved_url
|
||||
|
||||
@@ -55,5 +95,7 @@ def register_filters(app):
|
||||
app.jinja_env.filters['time_difference'] = time_difference
|
||||
app.jinja_env.filters['status_color'] = status_color
|
||||
app.jinja_env.filters['prefixed_url_for'] = prefixed_url_for
|
||||
app.jinja_env.filters['markdown'] = render_markdown
|
||||
app.jinja_env.filters['clean_markdown'] = clean_markdown
|
||||
|
||||
app.jinja_env.globals['prefixed_url_for'] = prefixed_url_for
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
{{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
|
||||
<div class="form-group mt-3">
|
||||
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary">Edit Document Version</button>
|
||||
<button type="submit" name="action" value="view_document_version_markdown" class="btn btn-danger">View Processed Document</button>
|
||||
<button type="submit" name="action" value="process_document_version" class="btn btn-danger">Process Document Version</button>
|
||||
</div>
|
||||
</form>
|
||||
@@ -21,4 +22,4 @@
|
||||
|
||||
{% block content_footer %}
|
||||
{{ render_pagination(pagination, 'document_bp.documents') }}
|
||||
{% endblock %}
|
||||
{% endblock %}
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
<div class="form-group mt-3 d-flex justify-content-between">
|
||||
<div>
|
||||
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button>
|
||||
{# Like the other actions in this form, viewing requires a selected table row. #}
<button type="submit" name="action" value="view_document_version_markdown" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">View Processed Document</button>
|
||||
<button type="submit" name="action" value="process_document_version" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">Process Document Version</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
115
eveai_app/templates/document/view_document_version_markdown.html
Normal file
115
eveai_app/templates/document/view_document_version_markdown.html
Normal file
@@ -0,0 +1,115 @@
|
||||
{% extends "base.html" %}
|
||||
{% block title %}Document Version Markdown{% endblock %}
|
||||
|
||||
{% block content_title %}Document Version Markdown{% endblock %}
|
||||
{% block content_description %}Markdown inhoud van document versie {{ document_version.id }}.{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container mt-5">
|
||||
<div class="card">
|
||||
<div class="card-header bg-light">
|
||||
<h5>Document Informatie</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p><strong>Document ID:</strong> {{ document_version.doc_id }}</p>
|
||||
<p><strong>Versie ID:</strong> {{ document_version.id }}</p>
|
||||
<p><strong>Object Naam:</strong> {{ document_version.object_name }}</p>
|
||||
<p><strong>Taal:</strong> {{ document_version.language }}</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card mt-4">
|
||||
<div class="card-header bg-light d-flex justify-content-between align-items-center">
|
||||
<h5>Markdown Content</h5>
|
||||
<div class="btn-group" role="group">
|
||||
<button class="btn btn-sm btn-outline-secondary" id="showRaw">Toon ruw</button>
|
||||
<button class="btn btn-sm btn-outline-primary active" id="showRendered">Toon gerenderd</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<!-- Ruwe markdown weergave (standaard verborgen) -->
|
||||
<div id="rawMarkdown" class="code-wrapper" style="display: none;">
|
||||
<pre><code class="language-markdown">{{ markdown_content }}</code></pre>
|
||||
</div>
|
||||
|
||||
<!-- Gerenderde markdown weergave -->
|
||||
<div id="renderedMarkdown" class="markdown-body">
|
||||
{{ markdown_content | markdown }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block styles %}
|
||||
{{ super() }}
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/github-markdown-css@4.0.0/github-markdown.min.css">
|
||||
<style>
|
||||
pre, code {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
white-space: pre-wrap !important;
|
||||
word-wrap: break-word !important;
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
pre code {
|
||||
padding: 1rem !important;
|
||||
border-radius: 4px;
|
||||
font-size: 0.75rem;
|
||||
line-height: 1.5;
|
||||
white-space: pre-wrap !important;
|
||||
}
|
||||
|
||||
.code-wrapper {
|
||||
position: relative;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.markdown-body {
|
||||
padding: 1rem;
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
/* Zorg ervoor dat de markdown goed uitziet in de donkere modus van de app (optioneel) */
|
||||
@media (prefers-color-scheme: dark) {
|
||||
.markdown-body {
|
||||
color: #c9d1d9;
|
||||
background-color: #0d1117;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
    // Syntax highlighting is optional: this template does not load highlight.js
    // itself, so guard the call. Without the guard a missing `hljs` would throw
    // and the toggle buttons below would never be wired up.
    if (typeof hljs !== 'undefined') {
        document.querySelectorAll('pre code').forEach((block) => {
            hljs.highlightElement(block);
        });
    }

    // Toggle buttons for switching between the raw and the rendered view
    const showRawBtn = document.getElementById('showRaw');
    const showRenderedBtn = document.getElementById('showRendered');
    const rawMarkdown = document.getElementById('rawMarkdown');
    const renderedMarkdown = document.getElementById('renderedMarkdown');

    showRawBtn.addEventListener('click', function() {
        rawMarkdown.style.display = 'block';
        renderedMarkdown.style.display = 'none';
        showRawBtn.classList.add('active');
        showRenderedBtn.classList.remove('active');
    });

    showRenderedBtn.addEventListener('click', function() {
        rawMarkdown.style.display = 'none';
        renderedMarkdown.style.display = 'block';
        showRawBtn.classList.remove('active');
        showRenderedBtn.classList.add('active');
    });
});
|
||||
</script>
|
||||
{% endblock %}
|
||||
@@ -12,7 +12,7 @@ from requests.exceptions import SSLError
|
||||
import json
|
||||
|
||||
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
|
||||
from common.extensions import db, cache_manager
|
||||
from common.extensions import db, cache_manager, minio_client
|
||||
from common.models.interaction import Specialist, SpecialistRetriever
|
||||
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
|
||||
edit_document, \
|
||||
@@ -215,7 +215,7 @@ def edit_processor(processor_id):
|
||||
try:
|
||||
db.session.add(processor)
|
||||
db.session.commit()
|
||||
flash('Retriever updated successfully!', 'success')
|
||||
flash('Processor updated successfully!', 'success')
|
||||
current_app.logger.info(f'Processor {processor.id} updated successfully')
|
||||
except SQLAlchemyError as e:
|
||||
db.session.rollback()
|
||||
@@ -649,6 +649,7 @@ def handle_document_version_selection():
|
||||
return redirect(prefixed_url_for('document_bp.document_versions_list'))
|
||||
|
||||
action = request.form['action']
|
||||
current_app.logger.debug(f'Action: {action}')
|
||||
|
||||
match action:
|
||||
case 'edit_document_version':
|
||||
@@ -656,6 +657,9 @@ def handle_document_version_selection():
|
||||
case 'process_document_version':
|
||||
process_version(doc_vers_id)
|
||||
# Add more conditions for other actions
|
||||
case 'view_document_version_markdown':
|
||||
return redirect(prefixed_url_for('document_bp.view_document_version_markdown',
|
||||
document_version_id=doc_vers_id))
|
||||
|
||||
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
|
||||
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id))
|
||||
@@ -772,6 +776,45 @@ def document_versions_list():
|
||||
return view.get()
|
||||
|
||||
|
||||
@document_bp.route('/view_document_version_markdown/<int:document_version_id>', methods=['GET'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def view_document_version_markdown(document_version_id):
    """
    Show the processed markdown file that belongs to a document version.

    The document version is loaded (404 when it does not exist), the markdown
    object name is derived through ``minio_client.generate_object_name`` and the
    file is downloaded via ``minio_client.download_document_file``. The bytes
    are decoded as UTF-8 and handed to the
    ``document/view_document_version_markdown.html`` template.

    On any error while retrieving or decoding the file, the error is logged and
    flashed, and the user is redirected to the versions page of the owning
    document.

    :param document_version_id: primary key of the DocumentVersion to display.
    """
    current_app.logger.debug(f'Viewing document version markdown {document_version_id}')
    # Retrieve document version
    document_version = DocumentVersion.query.get_or_404(document_version_id)

    # Retrieve tenant information
    tenant_id = session.get('tenant').get('id')

    try:
        # Generate markdown filename
        markdown_filename = f"{document_version.id}.md"
        markdown_object_name = minio_client.generate_object_name(document_version.doc_id, document_version.language,
                                                                 document_version.id, markdown_filename)
        current_app.logger.debug(f'Markdown object name: {markdown_object_name}')
        # Download actual markdown file
        file_data = minio_client.download_document_file(
            tenant_id,
            document_version.bucket_name,
            markdown_object_name,
        )

        # Decode the binary data to UTF-8 text
        markdown_content = file_data.decode('utf-8')
        current_app.logger.debug(f'Markdown content: {markdown_content}')

        # Render the template with the markdown content
        return render_template(
            'document/view_document_version_markdown.html',
            document_version=document_version,
            markdown_content=markdown_content
        )
    except Exception as e:
        current_app.logger.error(f"Error retrieving markdown for document version {document_version_id}: {str(e)}")
        flash(f"Error retrieving processed document: {str(e)}", "danger")
        # The document_versions endpoint is built with a document_id elsewhere
        # in this module; pass it here too so the redirect URL can be built and
        # the flashed error is not masked by a URL-building failure.
        return redirect(prefixed_url_for('document_bp.document_versions', document_id=document_version.doc_id))
|
||||
|
||||
|
||||
def refresh_all_documents():
|
||||
for doc in Document.query.all():
|
||||
refresh_document(doc.id)
|
||||
@@ -842,3 +885,16 @@ def fetch_html(url):
|
||||
|
||||
response.raise_for_status() # Will raise an exception for bad requests
|
||||
return response.content
|
||||
|
||||
|
||||
def clean_markdown(markdown):
    """
    Remove a wrapping triple-backtick fence from markdown text.

    Falsy input (``None`` or an empty string) yields an empty string instead of
    raising. Otherwise the text is stripped of surrounding whitespace; a leading
    "```markdown" or "```" and a trailing "```" are removed, with the remainder
    stripped again after each removal.

    :param markdown: markdown text, possibly wrapped in a code fence.
    :return: the text without the wrapping fence.
    """
    if not markdown:
        return ""

    markdown = markdown.strip()
    # Test the labelled opener first so the "markdown" label is dropped as well.
    if markdown.startswith("```markdown"):
        markdown = markdown[len("```markdown"):].strip()
    elif markdown.startswith("```"):
        markdown = markdown[3:].strip()
    if markdown.endswith("```"):
        markdown = markdown[:-3].strip()
    return markdown
|
||||
|
||||
|
||||
@@ -106,7 +106,7 @@ class HTMLProcessor(BaseProcessor):
|
||||
current_chunk = []
|
||||
current_size = 0
|
||||
|
||||
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
|
||||
for element in soup.find_all(self.html_tags):
|
||||
element_html = str(element)
|
||||
element_size = len(element_html)
|
||||
|
||||
|
||||
@@ -91,4 +91,5 @@ contextvars~=2.4
|
||||
pandas~=2.2.3
|
||||
prometheus_client~=0.21.1
|
||||
scaleway~=2.9.0
|
||||
html2text~=2025.4.15
|
||||
html2text~=2025.4.15
|
||||
markdown~=3.8
|
||||
|
||||
Reference in New Issue
Block a user