- Addition of general chunking parameters chunking_heading_level and chunking patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -11,9 +11,10 @@ from common.utils.document_utils import (
create_document_stack, process_url, start_embedding_task,
validate_file_type, EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
get_documents_list, edit_document, refresh_document, edit_document_version,
refresh_document_with_info
refresh_document_with_info, lookup_document
)
from common.utils.eveai_exceptions import EveAIException
from eveai_api.api.auth import requires_service
def validate_date(date_str):
@@ -59,6 +60,7 @@ add_document_response = document_ns.model('AddDocumentResponse', {
@document_ns.route('/add_document')
class AddDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(upload_parser)
@document_ns.response(201, 'Document added successfully', add_document_response)
@document_ns.response(400, 'Validation Error')
@@ -134,6 +136,7 @@ add_url_response = document_ns.model('AddURLResponse', {
@document_ns.route('/add_url')
class AddURL(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(add_url_model)
@document_ns.response(201, 'Document added successfully', add_url_response)
@document_ns.response(400, 'Validation Error')
@@ -190,6 +193,7 @@ document_list_model = document_ns.model('DocumentList', {
@document_ns.route('/list')
class DocumentList(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('list_documents')
@document_ns.marshal_list_with(document_list_model, envelope='documents')
def get(self):
@@ -210,6 +214,7 @@ edit_document_model = document_ns.model('EditDocument', {
@document_ns.route('/<int:document_id>')
class DocumentResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document')
@document_ns.expect(edit_document_model)
@document_ns.response(200, 'Document updated successfully')
@@ -232,6 +237,7 @@ class DocumentResource(Resource):
return e.to_dict(), e.status_code
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('refresh_document')
@document_ns.response(200, 'Document refreshed successfully')
def post(self, document_id):
@@ -253,6 +259,7 @@ edit_document_version_model = document_ns.model('EditDocumentVersion', {
@document_ns.route('/version/<int:version_id>')
class DocumentVersionResource(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.doc('edit_document_version')
@document_ns.expect(edit_document_version_model)
@document_ns.response(200, 'Document version updated successfully')
@@ -280,6 +287,7 @@ refresh_document_model = document_ns.model('RefreshDocument', {
@document_ns.route('/<int:document_id>/refresh')
class RefreshDocument(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(404, 'Document not found')
def post(self, document_id):
@@ -310,6 +318,7 @@ class RefreshDocument(Resource):
@document_ns.route('/<int:document_id>/refresh_with_info')
class RefreshDocumentWithInfo(Resource):
@jwt_required()
@requires_service('DOCAPI')
@document_ns.expect(refresh_document_model)
@document_ns.response(200, 'Document refreshed successfully')
@document_ns.response(400, 'Validation Error')
@@ -338,3 +347,112 @@ class RefreshDocumentWithInfo(Resource):
except Exception as e:
current_app.logger.error(f'Error refreshing document with info: {str(e)}')
return {'message': 'Internal server error'}, 500
# Define models for lookup requests
# Request body accepted by POST /lookup: the key/value pairs to match and the
# name of the metadata attribute they are matched against.
lookup_model = document_ns.model(
    'DocumentLookup',
    {
        'lookup_criteria': fields.Raw(
            description=(
                'JSON object containing key-value pairs to match in metadata. '
                'Example: {"external_id": "123", "source": "zapier", "source_type": "google_docs"}'
            ),
            required=True,
        ),
        'metadata_type': fields.String(
            description='Which metadata field to search in',
            enum=['user_metadata', 'system_metadata'],
            required=True,
        ),
    },
)
# Response body returned when a lookup finds a document.
lookup_response = document_ns.model(
    'DocumentLookupResponse',
    {
        'document_id': fields.Integer(
            description='ID of the found document',
        ),
        'document_version_id': fields.Integer(
            description='ID of the latest document version',
        ),
        'name': fields.String(
            description='Document name',
        ),
        'metadata': fields.Raw(
            description='Full metadata of the found document',
        ),
    },
)
@document_ns.route('/lookup')
class DocumentLookup(Resource):
    """Look up a document by matching key/value pairs in its metadata."""

    # Metadata attributes a caller may search in; mirrors the enum declared on
    # lookup_model. Checked explicitly because the value is later used as an
    # attribute name in getattr().
    _METADATA_TYPES = ('user_metadata', 'system_metadata')

    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(lookup_model)
    @document_ns.response(200, 'Document found', lookup_response)
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(404, 'No document found matching criteria')
    def post(self):
        """
        Look up a document using metadata criteria

        Expects a JSON object with 'lookup_criteria' and 'metadata_type'.

        Returns the marshalled lookup_response payload with status 200 on
        success, a 400 message when the body is not a JSON object, a required
        field is missing or 'metadata_type' is not an allowed value, and the
        exception's own payload/status when lookup_document raises an
        EveAIException.
        """
        tenant_id = get_jwt_identity()

        data = request.json
        if not isinstance(data, dict):
            return {'message': 'Request body must be a JSON object'}, 400

        # Only the two required-field reads are guarded, so a KeyError raised
        # deeper in the lookup is not misreported as a missing request field.
        try:
            lookup_criteria = data['lookup_criteria']
            metadata_type = data['metadata_type']
        except KeyError as e:
            return {'message': f'Missing required field: {str(e)}'}, 400

        if metadata_type not in self._METADATA_TYPES:
            return {'message': f'Invalid metadata_type: {metadata_type!r}'}, 400

        try:
            document, version = lookup_document(tenant_id, lookup_criteria, metadata_type)
        except EveAIException as e:
            return e.to_dict(), e.status_code

        # Marshal only the success payload. A marshal_with decorator would also
        # push the error tuples above through lookup_response and replace their
        # content with null fields.
        found = {
            'document_id': document.id,
            'document_version_id': version.id,
            'name': document.name,
            'metadata': getattr(version, metadata_type),
        }
        return document_ns.marshal(found, lookup_response), 200
# Request body accepted by POST /<document_id>/refresh_content. Only
# 'file_content' is required; every other field is optional.
refresh_content_model = document_ns.model(
    'RefreshDocumentContent',
    {
        'file_content': fields.Raw(
            description='The new file content',
            required=True,
        ),
        'language': fields.String(
            description='Language of the document',
            required=False,
        ),
        'user_context': fields.String(
            description='User context for the document',
            required=False,
        ),
        'user_metadata': fields.Raw(
            description='Custom metadata fields',
            required=False,
        ),
        'catalog_properties': fields.Raw(
            description='Catalog-specific properties',
            required=False,
        ),
        'trigger_service': fields.String(
            description='Service that triggered the update',
            required=False,
        ),
    },
)
@document_ns.route('/<int:document_id>/refresh_content')
class RefreshDocumentContent(Resource):
    """Refresh an existing document from content supplied in the request body."""

    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(refresh_content_model)
    @document_ns.response(200, 'Document refreshed successfully')
    @document_ns.response(400, 'Validation Error')
    def post(self, document_id):
        """Refresh a document with new content

        Expects a JSON object with a required 'file_content' and the optional
        fields declared on refresh_content_model.

        Returns a summary dict (message, document_id, document_version_id,
        task_id) with status 200 on success, a 400 message when the body is
        malformed, and the exception's own payload/status when the refresh
        raises an EveAIException.
        """
        tenant_id = get_jwt_identity()

        data = request.json
        if not isinstance(data, dict):
            return {'message': 'Request body must be a JSON object'}, 400
        if 'file_content' not in data:
            return {'message': "Missing required field: 'file_content'"}, 400

        # An explicit JSON null is treated the same as an absent field.
        user_metadata = data.get('user_metadata') or {}
        if not isinstance(user_metadata, dict):
            return {'message': 'user_metadata must be a JSON object'}, 400

        # Request-supplied metadata is extended with the Zapier-specific
        # fields, which win over same-named keys sent by the caller. Metadata
        # already stored on the document is NOT merged in here.
        data['user_metadata'] = {
            **user_metadata,
            'source': 'zapier',
            'trigger_service': data.get('trigger_service'),
        }

        # 'catalog_properties' (when present) is passed through untouched as
        # part of `data`; it could be validated against the catalog
        # configuration here.

        # NOTE(review): refresh_document_with_content does not appear in the
        # import list visible in this change -- confirm it is imported,
        # otherwise this call raises NameError at request time.
        try:
            new_version, task_id = refresh_document_with_content(
                document_id,
                tenant_id,
                data['file_content'],
                data
            )
            return {
                'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {task_id}',
                'document_id': document_id,
                'document_version_id': new_version.id,
                'task_id': task_id
            }, 200
        except EveAIException as e:
            return e.to_dict(), e.status_code