Added excluded element classes to HTML parsing to allow for more complex document parsing
Added chunking to conversion of HTML to markdown in case of large files
This commit is contained in:
@@ -32,6 +32,7 @@ class TenantForm(FlaskForm):
|
||||
default='p, li')
|
||||
html_included_elements = StringField('HTML Included Elements', validators=[Optional()])
|
||||
html_excluded_elements = StringField('HTML Excluded Elements', validators=[Optional()])
|
||||
html_excluded_classes = StringField('HTML Excluded Classes', validators=[Optional()])
|
||||
min_chunk_size = IntegerField('Minimum Chunk Size (2000)', validators=[NumberRange(min=0), Optional()], default=2000)
|
||||
max_chunk_size = IntegerField('Maximum Chunk Size (3000)', validators=[NumberRange(min=0), Optional()], default=3000)
|
||||
# Embedding Search variables
|
||||
|
||||
@@ -68,6 +68,8 @@ def tenant():
|
||||
if form.html_included_elements.data else []
|
||||
new_tenant.html_excluded_elements = [tag.strip() for tag in form.html_excluded_elements.data.split(',')] \
|
||||
if form.html_excluded_elements.data else []
|
||||
new_tenant.html_excluded_classes = [cls.strip() for cls in form.html_excluded_classes.data.split(',')] \
|
||||
if form.html_excluded_classes.data else []
|
||||
|
||||
current_app.logger.debug(f'html_tags: {new_tenant.html_tags},'
|
||||
f'html_end_tags: {new_tenant.html_end_tags},'
|
||||
@@ -123,6 +125,8 @@ def edit_tenant(tenant_id):
|
||||
form.html_included_elements.data = ', '.join(tenant.html_included_elements)
|
||||
if tenant.html_excluded_elements:
|
||||
form.html_excluded_elements.data = ', '.join(tenant.html_excluded_elements)
|
||||
if tenant.html_excluded_classes:
|
||||
form.html_excluded_classes.data = ', '.join(tenant.html_excluded_classes)
|
||||
|
||||
if form.validate_on_submit():
|
||||
# Populate the tenant with form data
|
||||
@@ -134,6 +138,8 @@ def edit_tenant(tenant_id):
|
||||
elem.strip()]
|
||||
tenant.html_excluded_elements = [elem.strip() for elem in form.html_excluded_elements.data.split(',') if
|
||||
elem.strip()]
|
||||
tenant.html_excluded_classes = [elem.strip() for elem in form.html_excluded_classes.data.split(',') if
|
||||
elem.strip()]
|
||||
|
||||
db.session.commit()
|
||||
flash('Tenant updated successfully.', 'success')
|
||||
|
||||
Reference in New Issue
Block a user