Added excluded element classes to HTML parsing to allow for more complex document parsing

Added chunking to conversion of HTML to markdown in case of large files
This commit is contained in:
Josako
2024-08-22 16:41:13 +02:00
parent a9f9b04117
commit 2ca006d82c
10 changed files with 181 additions and 46 deletions

View File

@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
default='p, li')
html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
# Embedding Search variables