Added excluded element classes to HTML parsing to allow for more complex document parsing

Added chunking to the conversion of HTML to Markdown to handle large files
2024-08-22 16:41:13 +02:00
parent a9f9b04117
commit 2ca006d82c
10 changed files with 181 additions and 46 deletions
--- a/common/utils/model_utils.py
+++ b/common/utils/model_utils.py
@@ -86,6 +86,7 @@ def select_model_variables(tenant):
    model_variables['html_end_tags'] = tenant.html_end_tags
    model_variables['html_included_elements'] = tenant.html_included_elements
    model_variables['html_excluded_elements'] = tenant.html_excluded_elements
+    model_variables['html_excluded_classes'] = tenant.html_excluded_classes

    # Set Chunk Size variables
    model_variables['min_chunk_size'] = tenant.min_chunk_size