- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -5,7 +5,46 @@ import json
from wtforms.fields.choices import SelectField
from wtforms.fields.datetime import DateField
from common.utils.config_field_types import TaggingFields
from common.utils.config_field_types import TaggingFields, json_to_patterns, patterns_to_json
class TaggingFieldsField(TextAreaField):
    """Text area the dynamic form selects for ``tagging_fields`` entries.

    The constructor always installs a fixed ``render_kw`` (a CSS class plus
    the ``data-handle-enter`` attribute). A ``render_kw`` passed in by the
    caller is replaced, not merged.
    """

    def __init__(self, *args, **kwargs):
        # Fixed HTML attributes for the rendered <textarea>.
        widget_attrs = {
            'class': 'chunking-patterns-field',
            'data-handle-enter': 'true',
        }
        super().__init__(*args, **{**kwargs, 'render_kw': widget_attrs})

    # Disabled draft of field-level JSON (de)serialisation, kept for reference.
    # While it stays disabled, ``self.data`` is whatever the base class stores.
    #
    # def _value(self):
    #     if self.data:
    #         return json.dumps(self.data)
    #     return ''
    #
    # def process_formdata(self, valuelist):
    #     if valuelist and valuelist[0]:
    #         try:
    #             self.data = json.loads(valuelist[0])
    #         except json.JSONDecodeError as e:
    #             raise ValueError('Not valid JSON content')
class ChunkingPatternsField(TextAreaField):
    """Text area the dynamic form selects for ``chunking_patterns`` entries.

    Every instance is rendered with the same fixed HTML attributes. The
    constructor overwrites any ``render_kw`` the caller supplies.
    """

    def __init__(self, *args, **kwargs):
        # Build the final keyword set first, then delegate to the base class.
        init_kwargs = dict(kwargs)
        init_kwargs['render_kw'] = {
            'class': 'chunking-patterns-field',
            'data-handle-enter': 'true',
        }
        super().__init__(*args, **init_kwargs)

    # Disabled draft of a one-pattern-per-line (de)serialisation, kept for
    # reference. While it stays disabled, ``self.data`` is whatever the base
    # class stores.
    #
    # def _value(self):
    #     if self.data:
    #         return '\n'.join(self.data)
    #     return ''
    #
    # def process_formdata(self, valuelist):
    #     if valuelist and valuelist[0]:
    #         self.data = [line.strip() for line in valuelist[0].split('\n') if line.strip()]
class DynamicFormBase(FlaskForm):
@@ -80,7 +119,7 @@ class DynamicFormBase(FlaskForm):
# Handle special case for tagging_fields
if field_type == 'tagging_fields':
field_class = TextAreaField
field_class = TaggingFieldsField
extra_classes = 'json-editor'
field_kwargs = {}
elif field_type == 'enum':
@@ -89,6 +128,10 @@ class DynamicFormBase(FlaskForm):
choices = [(str(val), str(val)) for val in allowed_values]
extra_classes = ''
field_kwargs = {'choices': choices}
elif field_type == 'chunking_patterns':
field_class = ChunkingPatternsField
extra_classes = ['monospace-text', 'pattern-input']
field_kwargs = {}
else:
extra_classes = ''
field_class = {
@@ -111,6 +154,12 @@ class DynamicFormBase(FlaskForm):
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to JSON: {e}")
field_data = "{}"
elif field_type == 'chunking_patterns':
try:
field_data = json_to_patterns(field_data)
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to a list of patterns: {e}")
field_data = {}
elif default is not None:
field_data = default
@@ -173,12 +222,17 @@ class DynamicFormBase(FlaskForm):
original_field_name = full_field_name[prefix_length:]
field = getattr(self, full_field_name)
# Parse JSON for tagging_fields type
if isinstance(field, TextAreaField) and field.data:
if isinstance(field, TaggingFieldsField) and field.data:
try:
data[original_field_name] = json.loads(field.data)
except json.JSONDecodeError:
# Validation should catch this, but just in case
data[original_field_name] = field.data
elif isinstance(field, ChunkingPatternsField):
try:
data[original_field_name] = patterns_to_json(field.data)
except Exception as e:
current_app.logger.error(f"Error converting initial data to patterns: {e}")
else:
data[original_field_name] = field.data
return data
@@ -230,5 +284,3 @@ def validate_tagging_fields(form, field):
except (TypeError, ValueError) as e:
raise ValidationError(f"Invalid field definition: {str(e)}")