- Addition of general chunking parameters chunking_heading_level and chunking patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -15,14 +15,6 @@ from config.type_defs.retriever_types import RETRIEVER_TYPES
from .dynamic_form_base import DynamicFormBase
def allowed_file(form, field):
if field.data:
filename = field.data.filename
allowed_extensions = current_app.config.get('SUPPORTED_FILE_TYPES', [])
if not ('.' in filename and filename.rsplit('.', 1)[1].lower() in allowed_extensions):
raise ValidationError('Unsupported file type.')
def validate_json(form, field):
if field.data:
try:
@@ -101,7 +93,10 @@ class ProcessorForm(FlaskForm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Dynamically populate the 'type' field using the constructor
self.type.choices = [(key, value['name']) for key, value in PROCESSOR_TYPES.items()]
self.type.choices = sorted(
[(key, value['name']) for key, value in PROCESSOR_TYPES.items()],
key=lambda x: x[1],
)
class EditProcessorForm(DynamicFormBase):
@@ -177,7 +172,7 @@ class EditRetrieverForm(DynamicFormBase):
class AddDocumentForm(DynamicFormBase):
file = FileField('File', validators=[FileRequired(), allowed_file])
file = FileField('File', validators=[FileRequired()])
catalog = StringField('Catalog', render_kw={'readonly': True})
sub_file_type = StringField('Sub File Type', validators=[Optional(), Length(max=50)])
name = StringField('Name', validators=[Length(max=100)])

View File

@@ -14,7 +14,7 @@ import json
from common.models.document import Document, DocumentVersion, Catalog, Retriever, Processor
from common.extensions import db
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import validate_file_type, create_document_stack, start_embedding_task, process_url, \
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
@@ -391,9 +391,6 @@ def add_document():
sub_file_type = form.sub_file_type.data
filename = secure_filename(file.filename)
extension = filename.rsplit('.', 1)[1].lower()
validate_file_type(extension)
catalog_properties = {}
document_version_configurations = CATALOG_TYPES[catalog.type]['document_version_configurations']
for config in document_version_configurations:

View File

@@ -5,7 +5,46 @@ import json
from wtforms.fields.choices import SelectField
from wtforms.fields.datetime import DateField
from common.utils.config_field_types import TaggingFields
from common.utils.config_field_types import TaggingFields, json_to_patterns, patterns_to_json
class TaggingFieldsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return json.dumps(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# try:
# self.data = json.loads(valuelist[0])
# except json.JSONDecodeError as e:
# raise ValueError('Not valid JSON content')
class ChunkingPatternsField(TextAreaField):
def __init__(self, *args, **kwargs):
kwargs['render_kw'] = {
'class': 'chunking-patterns-field',
'data-handle-enter': 'true'
}
super().__init__(*args, **kwargs)
# def _value(self):
# if self.data:
# return '\n'.join(self.data)
# return ''
#
# def process_formdata(self, valuelist):
# if valuelist and valuelist[0]:
# self.data = [line.strip() for line in valuelist[0].split('\n') if line.strip()]
class DynamicFormBase(FlaskForm):
@@ -80,7 +119,7 @@ class DynamicFormBase(FlaskForm):
# Handle special case for tagging_fields
if field_type == 'tagging_fields':
field_class = TextAreaField
field_class = TaggingFieldsField
extra_classes = 'json-editor'
field_kwargs = {}
elif field_type == 'enum':
@@ -89,6 +128,10 @@ class DynamicFormBase(FlaskForm):
choices = [(str(val), str(val)) for val in allowed_values]
extra_classes = ''
field_kwargs = {'choices': choices}
elif field_type == 'chunking_patterns':
field_class = ChunkingPatternsField
extra_classes = ['monospace-text', 'pattern-input']
field_kwargs = {}
else:
extra_classes = ''
field_class = {
@@ -111,6 +154,12 @@ class DynamicFormBase(FlaskForm):
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to JSON: {e}")
field_data = "{}"
elif field_type == 'chunking_patterns':
try:
field_data = json_to_patterns(field_data)
except (TypeError, ValueError) as e:
current_app.logger.error(f"Error converting initial data to a list of patterns: {e}")
field_data = {}
elif default is not None:
field_data = default
@@ -173,12 +222,17 @@ class DynamicFormBase(FlaskForm):
original_field_name = full_field_name[prefix_length:]
field = getattr(self, full_field_name)
# Parse JSON for tagging_fields type
if isinstance(field, TextAreaField) and field.data:
if isinstance(field, TaggingFieldsField) and field.data:
try:
data[original_field_name] = json.loads(field.data)
except json.JSONDecodeError:
# Validation should catch this, but just in case
data[original_field_name] = field.data
elif isinstance(field, ChunkingPatternsField):
try:
data[original_field_name] = patterns_to_json(field.data)
except Exception as e:
current_app.logger.error(f"Error converting initial data to patterns: {e}")
else:
data[original_field_name] = field.data
return data
@@ -230,5 +284,3 @@ def validate_tagging_fields(form, field):
except (TypeError, ValueError) as e:
raise ValidationError(f"Invalid field definition: {str(e)}")