Added excluded element classes to HTML parsing to allow for more complex document parsing

Added chunking to the conversion of HTML to Markdown to handle large files
2024-08-22 16:41:13 +02:00
parent a9f9b04117
commit 2ca006d82c
10 changed files with 181 additions and 46 deletions
--- a/eveai_app/views/user_forms.py
+++ b/eveai_app/views/user_forms.py
@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
                                default='p, li')
    html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
    html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
+    html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
    min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
    max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
    # Embedding Search variables