Improve HTML Processing + Introduction of Processed File viewer
This commit is contained in:
@@ -68,7 +68,7 @@ class LicensePeriodServices:
|
|||||||
# Status is PENDING, so no prepaid payment received. There is no license period we can use.
|
# Status is PENDING, so no prepaid payment received. There is no license period we can use.
|
||||||
# We allow for a delay of 5 days before raising an exception.
|
# We allow for a delay of 5 days before raising an exception.
|
||||||
current_date = dt.now(tz.utc).date()
|
current_date = dt.now(tz.utc).date()
|
||||||
delta = abs(current_date - license_period.period_start_date)
|
delta = abs(current_date - license_period.period_start)
|
||||||
if delta > timedelta(days=current_app.config.get('ENTITLEMENTS_MAX_PENDING_DAYS', 5)):
|
if delta > timedelta(days=current_app.config.get('ENTITLEMENTS_MAX_PENDING_DAYS', 5)):
|
||||||
raise EveAIPendingLicensePeriod()
|
raise EveAIPendingLicensePeriod()
|
||||||
case PeriodStatus.ACTIVE:
|
case PeriodStatus.ACTIVE:
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
# common/utils/filters.py
|
# common/utils/filters.py
|
||||||
|
|
||||||
import pytz
|
import pytz
|
||||||
|
import markdown
|
||||||
|
from markupsafe import Markup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from common.utils.nginx_utils import prefixed_url_for as puf
|
from common.utils.nginx_utils import prefixed_url_for as puf
|
||||||
|
|
||||||
@@ -43,6 +45,44 @@ def status_color(status_name):
|
|||||||
return colors.get(status_name, 'secondary')
|
return colors.get(status_name, 'secondary')
|
||||||
|
|
||||||
|
|
||||||
|
def render_markdown(text):
    """
    Render a markdown string to HTML for direct use in a Jinja template.

    The input is first passed through ``clean_markdown`` to drop surrounding
    triple-backtick fences, then converted with the ``markdown`` library using
    the fenced_code, codehilite, tables and toc extensions.

    Returns an empty string for falsy input, otherwise a ``Markup`` object.
    """
    if not text:
        return ""

    enabled_extensions = [
        'markdown.extensions.fenced_code',
        'markdown.extensions.codehilite',
        'markdown.extensions.tables',
        'markdown.extensions.toc'
    ]

    # Remove the triple backticks and the "markdown" label before rendering.
    unwrapped = clean_markdown(text)
    html = markdown.markdown(unwrapped, extensions=enabled_extensions)

    # NOTE(review): wrapping in Markup tells Jinja not to escape the result, so
    # any raw HTML carried in the markdown would presumably reach the browser
    # as-is -- confirm the stored content is trusted or sanitized upstream.
    return Markup(html)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_markdown(text):
    """
    Remove a wrapping triple-backtick fence (optionally labelled ``markdown``).

    Some producers wrap a whole markdown document in a code fence. This strips
    that wrapper so the content can be rendered as markdown.

    The closing fence is only removed when an opening fence was found. A
    document that merely ends with a fenced code block keeps its closing fence,
    so that final block stays intact.

    Returns an empty string for falsy input, otherwise the stripped text.
    """
    if not text:
        return ""

    fence = "```"
    labelled_fence = "```markdown"

    text = text.strip()
    if text.startswith(labelled_fence):
        text = text[len(labelled_fence):].strip()
    elif text.startswith(fence):
        text = text[len(fence):].strip()
    else:
        # No wrapper was opened, so a trailing fence belongs to a real code
        # block at the end of the document and must be preserved.
        return text

    if text.endswith(fence):
        text = text[:-len(fence)].strip()

    return text
|
||||||
|
|
||||||
|
|
||||||
def prefixed_url_for(endpoint):
|
def prefixed_url_for(endpoint):
|
||||||
return puf(endpoint)
|
return puf(endpoint)
|
||||||
|
|
||||||
@@ -55,5 +95,7 @@ def register_filters(app):
|
|||||||
app.jinja_env.filters['time_difference'] = time_difference
|
app.jinja_env.filters['time_difference'] = time_difference
|
||||||
app.jinja_env.filters['status_color'] = status_color
|
app.jinja_env.filters['status_color'] = status_color
|
||||||
app.jinja_env.filters['prefixed_url_for'] = prefixed_url_for
|
app.jinja_env.filters['prefixed_url_for'] = prefixed_url_for
|
||||||
|
app.jinja_env.filters['markdown'] = render_markdown
|
||||||
|
app.jinja_env.filters['clean_markdown'] = clean_markdown
|
||||||
|
|
||||||
app.jinja_env.globals['prefixed_url_for'] = prefixed_url_for
|
app.jinja_env.globals['prefixed_url_for'] = prefixed_url_for
|
||||||
|
|||||||
@@ -13,6 +13,7 @@
|
|||||||
{{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
|
{{ render_selectable_table(headers=["ID", "URL", "Object Name", "File Type", "Process.", "Proces. Start", "Proces. Finish", "Proces. Error"], rows=rows, selectable=True, id="versionsTable") }}
|
||||||
<div class="form-group mt-3">
|
<div class="form-group mt-3">
|
||||||
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary">Edit Document Version</button>
|
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary">Edit Document Version</button>
|
||||||
|
<button type="submit" name="action" value="view_document_version_markdown" class="btn btn-danger">View Processed Document</button>
|
||||||
<button type="submit" name="action" value="process_document_version" class="btn btn-danger">Process Document Version</button>
|
<button type="submit" name="action" value="process_document_version" class="btn btn-danger">Process Document Version</button>
|
||||||
</div>
|
</div>
|
||||||
</form>
|
</form>
|
||||||
|
|||||||
@@ -37,6 +37,7 @@
|
|||||||
<div class="form-group mt-3 d-flex justify-content-between">
|
<div class="form-group mt-3 d-flex justify-content-between">
|
||||||
<div>
|
<div>
|
||||||
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button>
|
<button type="submit" name="action" value="edit_document_version" class="btn btn-primary" onclick="return validateTableSelection('documentVersionsForm')">Edit Document Version</button>
|
||||||
|
<button type="submit" name="action" value="view_document_version_markdown" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">View Processed Document</button>
|
||||||
<button type="submit" name="action" value="process_document_version" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">Process Document Version</button>
|
<button type="submit" name="action" value="process_document_version" class="btn btn-danger" onclick="return validateTableSelection('documentVersionsForm')">Process Document Version</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
115
eveai_app/templates/document/view_document_version_markdown.html
Normal file
115
eveai_app/templates/document/view_document_version_markdown.html
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
{% extends "base.html" %}
|
||||||
|
{% block title %}Document Version Markdown{% endblock %}
|
||||||
|
|
||||||
|
{% block content_title %}Document Version Markdown{% endblock %}
|
||||||
|
{% block content_description %}Markdown inhoud van document versie {{ document_version.id }}.{% endblock %}
|
||||||
|
|
||||||
|
{% block content %}
|
||||||
|
<div class="container mt-5">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header bg-light">
|
||||||
|
<h5>Document Informatie</h5>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p><strong>Document ID:</strong> {{ document_version.doc_id }}</p>
|
||||||
|
<p><strong>Versie ID:</strong> {{ document_version.id }}</p>
|
||||||
|
<p><strong>Object Naam:</strong> {{ document_version.object_name }}</p>
|
||||||
|
<p><strong>Taal:</strong> {{ document_version.language }}</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card mt-4">
|
||||||
|
<div class="card-header bg-light d-flex justify-content-between align-items-center">
|
||||||
|
<h5>Markdown Content</h5>
|
||||||
|
<div class="btn-group" role="group">
|
||||||
|
<button class="btn btn-sm btn-outline-secondary" id="showRaw">Toon ruw</button>
|
||||||
|
<button class="btn btn-sm btn-outline-primary active" id="showRendered">Toon gerenderd</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<!-- Ruwe markdown weergave (standaard verborgen) -->
|
||||||
|
<div id="rawMarkdown" class="code-wrapper" style="display: none;">
|
||||||
|
<pre><code class="language-markdown">{{ markdown_content }}</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Gerenderde markdown weergave -->
|
||||||
|
<div id="renderedMarkdown" class="markdown-body">
|
||||||
|
{{ markdown_content | markdown }}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block styles %}
|
||||||
|
{{ super() }}
|
||||||
|
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/github-markdown-css@4.0.0/github-markdown.min.css">
|
||||||
|
<style>
|
||||||
|
pre, code {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
white-space: pre-wrap !important;
|
||||||
|
word-wrap: break-word !important;
|
||||||
|
max-width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
pre code {
|
||||||
|
padding: 1rem !important;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.75rem;
|
||||||
|
line-height: 1.5;
|
||||||
|
white-space: pre-wrap !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.code-wrapper {
|
||||||
|
position: relative;
|
||||||
|
width: 100%;
|
||||||
|
}
|
||||||
|
|
||||||
|
.markdown-body {
|
||||||
|
padding: 1rem;
|
||||||
|
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
|
||||||
|
line-height: 1.6;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Zorg ervoor dat de markdown goed uitziet in de donkere modus van de app (optioneel) */
|
||||||
|
@media (prefers-color-scheme: dark) {
|
||||||
|
.markdown-body {
|
||||||
|
color: #c9d1d9;
|
||||||
|
background-color: #0d1117;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block scripts %}
|
||||||
|
{{ super() }}
|
||||||
|
<script>
|
||||||
|
// Wire up the markdown viewer once the DOM is ready: optional syntax
// highlighting plus the raw/rendered toggle buttons.
document.addEventListener('DOMContentLoaded', function() {
    // Highlight code blocks only when highlight.js is actually available.
    // Without this guard a missing library raises a ReferenceError that would
    // abort the handler before the toggle buttons below are wired up.
    if (typeof hljs !== 'undefined') {
        document.querySelectorAll('pre code').forEach((block) => {
            hljs.highlightElement(block);
        });
    }

    // Toggle buttons for switching between the raw and rendered views
    const showRawBtn = document.getElementById('showRaw');
    const showRenderedBtn = document.getElementById('showRendered');
    const rawMarkdown = document.getElementById('rawMarkdown');
    const renderedMarkdown = document.getElementById('renderedMarkdown');

    showRawBtn.addEventListener('click', function() {
        rawMarkdown.style.display = 'block';
        renderedMarkdown.style.display = 'none';
        showRawBtn.classList.add('active');
        showRenderedBtn.classList.remove('active');
    });

    showRenderedBtn.addEventListener('click', function() {
        rawMarkdown.style.display = 'none';
        renderedMarkdown.style.display = 'block';
        showRawBtn.classList.remove('active');
        showRenderedBtn.classList.add('active');
    });
});
|
||||||
|
</script>
|
||||||
|
{% endblock %}
|
||||||
@@ -12,7 +12,7 @@ from requests.exceptions import SSLError
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
|
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
|
||||||
from common.extensions import db, cache_manager
|
from common.extensions import db, cache_manager, minio_client
|
||||||
from common.models.interaction import Specialist, SpecialistRetriever
|
from common.models.interaction import Specialist, SpecialistRetriever
|
||||||
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
|
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
|
||||||
edit_document, \
|
edit_document, \
|
||||||
@@ -215,7 +215,7 @@ def edit_processor(processor_id):
|
|||||||
try:
|
try:
|
||||||
db.session.add(processor)
|
db.session.add(processor)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
flash('Retriever updated successfully!', 'success')
|
flash('Processor updated successfully!', 'success')
|
||||||
current_app.logger.info(f'Processor {processor.id} updated successfully')
|
current_app.logger.info(f'Processor {processor.id} updated successfully')
|
||||||
except SQLAlchemyError as e:
|
except SQLAlchemyError as e:
|
||||||
db.session.rollback()
|
db.session.rollback()
|
||||||
@@ -649,6 +649,7 @@ def handle_document_version_selection():
|
|||||||
return redirect(prefixed_url_for('document_bp.document_versions_list'))
|
return redirect(prefixed_url_for('document_bp.document_versions_list'))
|
||||||
|
|
||||||
action = request.form['action']
|
action = request.form['action']
|
||||||
|
current_app.logger.debug(f'Action: {action}')
|
||||||
|
|
||||||
match action:
|
match action:
|
||||||
case 'edit_document_version':
|
case 'edit_document_version':
|
||||||
@@ -656,6 +657,9 @@ def handle_document_version_selection():
|
|||||||
case 'process_document_version':
|
case 'process_document_version':
|
||||||
process_version(doc_vers_id)
|
process_version(doc_vers_id)
|
||||||
# Add more conditions for other actions
|
# Add more conditions for other actions
|
||||||
|
case 'view_document_version_markdown':
|
||||||
|
return redirect(prefixed_url_for('document_bp.view_document_version_markdown',
|
||||||
|
document_version_id=doc_vers_id))
|
||||||
|
|
||||||
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
|
doc_vers = DocumentVersion.query.get_or_404(doc_vers_id)
|
||||||
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id))
|
return redirect(prefixed_url_for('document_bp.document_versions', document_id=doc_vers.doc_id))
|
||||||
@@ -772,6 +776,45 @@ def document_versions_list():
|
|||||||
return view.get()
|
return view.get()
|
||||||
|
|
||||||
|
|
||||||
|
@document_bp.route('/view_document_version_markdown/<int:document_version_id>', methods=['GET'])
@roles_accepted('Super User', 'Partner Admin', 'Tenant Admin')
def view_document_version_markdown(document_version_id):
    """
    Show the processed markdown file that belongs to a document version.

    Looks up the document version (404 when it does not exist), builds the
    object name of its ``<version id>.md`` file, downloads that file through
    ``minio_client`` and renders it in the markdown viewer template.

    On any failure while locating, downloading or decoding the file, the error
    is logged and flashed, and the user is redirected to the version list of
    the owning document.
    """
    current_app.logger.debug(f'Viewing document version markdown {document_version_id}')
    # Retrieve document version
    document_version = DocumentVersion.query.get_or_404(document_version_id)

    # Retrieve tenant information
    tenant_id = session.get('tenant').get('id')

    try:
        # Generate markdown filename
        markdown_filename = f"{document_version.id}.md"
        markdown_object_name = minio_client.generate_object_name(document_version.doc_id, document_version.language,
                                                                 document_version.id, markdown_filename)
        current_app.logger.debug(f'Markdown object name: {markdown_object_name}')
        # Download actual markdown file
        file_data = minio_client.download_document_file(
            tenant_id,
            document_version.bucket_name,
            markdown_object_name,
        )

        # Decode the binary data to UTF-8 text
        markdown_content = file_data.decode('utf-8')
        # Log only the size: dumping the whole document would bloat the logs
        # and copy document content into them.
        current_app.logger.debug(f'Markdown content length: {len(markdown_content)} characters')

        # Render the template with the markdown content
        return render_template(
            'document/view_document_version_markdown.html',
            document_version=document_version,
            markdown_content=markdown_content
        )
    except Exception as e:
        current_app.logger.error(f"Error retrieving markdown for document version {document_version_id}: {str(e)}")
        flash(f"Error retrieving processed document: {str(e)}", "danger")
        # The versions endpoint is addressed per document elsewhere in this
        # module, so pass the owning document id; omitting it would make the
        # error handler itself fail while building the URL.
        return redirect(prefixed_url_for('document_bp.document_versions',
                                         document_id=document_version.doc_id))
|
||||||
|
|
||||||
|
|
||||||
def refresh_all_documents():
|
def refresh_all_documents():
|
||||||
for doc in Document.query.all():
|
for doc in Document.query.all():
|
||||||
refresh_document(doc.id)
|
refresh_document(doc.id)
|
||||||
@@ -842,3 +885,16 @@ def fetch_html(url):
|
|||||||
|
|
||||||
response.raise_for_status() # Will raise an exception for bad requests
|
response.raise_for_status() # Will raise an exception for bad requests
|
||||||
return response.content
|
return response.content
|
||||||
|
|
||||||
|
|
||||||
|
def clean_markdown(markdown):
    """
    Remove a wrapping triple-backtick fence (optionally labelled ``markdown``).

    The closing fence is only removed when an opening fence was found, so a
    document that merely ends with a fenced code block keeps that block intact.
    Falsy input (``None`` or an empty string) yields an empty string instead of
    raising.

    NOTE(review): common/utils/filters.py defines a function with the same name
    and purpose; consider keeping a single implementation.
    """
    if not markdown:
        return ""

    fence = "```"
    labelled_fence = "```markdown"

    markdown = markdown.strip()
    if markdown.startswith(labelled_fence):
        markdown = markdown[len(labelled_fence):].strip()
    elif markdown.startswith(fence):
        markdown = markdown[len(fence):].strip()
    else:
        # No wrapper was opened, so a trailing fence belongs to a real code
        # block at the end of the document and must be preserved.
        return markdown

    if markdown.endswith(fence):
        markdown = markdown[:-len(fence)].strip()

    return markdown
|
||||||
|
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ class HTMLProcessor(BaseProcessor):
|
|||||||
current_chunk = []
|
current_chunk = []
|
||||||
current_size = 0
|
current_size = 0
|
||||||
|
|
||||||
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
|
for element in soup.find_all(self.html_tags):
|
||||||
element_html = str(element)
|
element_html = str(element)
|
||||||
element_size = len(element_html)
|
element_size = len(element_html)
|
||||||
|
|
||||||
|
|||||||
@@ -92,3 +92,4 @@ pandas~=2.2.3
|
|||||||
prometheus_client~=0.21.1
|
prometheus_client~=0.21.1
|
||||||
scaleway~=2.9.0
|
scaleway~=2.9.0
|
||||||
html2text~=2025.4.15
|
html2text~=2025.4.15
|
||||||
|
markdown~=3.8
|
||||||
|
|||||||
Reference in New Issue
Block a user