"""REST endpoints for adding, editing, refreshing and looking up documents.

All resources are registered on ``document_ns`` and require both a valid JWT
(whose identity is used as the tenant id) and the ``DOCAPI`` service.
"""
import io
import json
from datetime import datetime
from typing import Tuple, Any

import pytz
import requests
from flask import current_app, request
from flask_restx import Namespace, Resource, fields, reqparse
from flask_jwt_extended import jwt_required, get_jwt_identity
from sqlalchemy import desc
from werkzeug.datastructures import FileStorage
from werkzeug.utils import secure_filename

from common.models.document import DocumentVersion
from common.utils.document_utils import (
    create_document_stack, process_url, start_embedding_task,
    EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
    get_documents_list, edit_document, refresh_document, edit_document_version,
    refresh_document_with_info, lookup_document, refresh_document_with_content, clean_url
)
from common.utils.eveai_exceptions import EveAIException
from eveai_api.api.auth import requires_service


def validate_date(date_str):
    """Parse an ISO-8601 string into a UTC-aware ``datetime``.

    Naive values are taken to be UTC. Values that already carry an offset are
    converted to UTC, so the instant they denote is preserved.

    Raises:
        ValueError: if ``date_str`` is not a valid ISO-8601 date/time.
    """
    try:
        parsed = datetime.fromisoformat(date_str)
    except (TypeError, ValueError) as exc:
        raise ValueError("Invalid date format. Use ISO format (YYYY-MM-DDTHH:MM:SS).") from exc
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=pytz.UTC)
    # Replacing tzinfo on an aware value would silently shift the instant.
    return parsed.astimezone(pytz.UTC)


def validate_json(json_str):
    """Decode a JSON string, raising ``ValueError`` when it is malformed."""
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        raise ValueError("Invalid JSON format for user_metadata.") from exc


def _parse_user_metadata(raw):
    """Return request ``user_metadata`` as a Python object.

    Accepts a JSON string, an already-decoded object (JSON bodies may carry
    the metadata inline) or nothing at all, in which case ``{}`` is returned.

    Raises:
        ValueError: if ``raw`` is a string that is not valid JSON.
        TypeError: if ``raw`` is of a type ``json.loads`` cannot decode.
    """
    if raw is None or raw == '':
        return {}
    if isinstance(raw, dict):
        return raw
    return json.loads(raw)


document_ns = Namespace('documents', description='Document related operations')

# Define models for request parsing and response serialization
upload_parser = reqparse.RequestParser()
upload_parser.add_argument('catalog_id', location='form', type=int, required=True,
                           help='The catalog to add the file to')
upload_parser.add_argument('file', location='files', type=FileStorage, required=True,
                           help='The file to upload')
upload_parser.add_argument('name', location='form', type=str, required=False,
                           help='Name of the document')
upload_parser.add_argument('language', location='form', type=str, required=True,
                           help='Language of the document')
upload_parser.add_argument('user_context', location='form', type=str, required=False,
                           help='User context for the document')
upload_parser.add_argument('valid_from', location='form', type=validate_date, required=False,
                           help='Valid from date for the document (ISO format)')
upload_parser.add_argument('user_metadata', location='form', type=validate_json, required=False,
                           help='User metadata for the document (JSON format)')
upload_parser.add_argument('catalog_properties', location='form', type=validate_json, required=False,
                           help='The catalog configuration to be passed along (JSON format). '
                                'Validity is against catalog requirements '
                                'is not checked, and is the responsibility of the calling client.')

add_document_response = document_ns.model('AddDocumentResponse', {
    'message': fields.String(description='Status message'),
    'document_id': fields.Integer(description='ID of the created document'),
    'document_version_id': fields.Integer(description='ID of the created document version'),
    'task_id': fields.String(description='ID of the embedding task')
})


@document_ns.route('/add_document')
class AddDocument(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(upload_parser)
    @document_ns.response(201, 'Document added successfully', add_document_response)
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(500, 'Internal Server Error')
    def post(self):
        """
        Upload a new document to EveAI by directly providing the file content.

        This endpoint accepts multipart/form-data with the file content and metadata.
        It processes the file, creates a new document in the specified catalog, and
        initiates the embedding process.
        """
        tenant_id = get_jwt_identity()
        current_app.logger.info(f'Adding document for tenant {tenant_id}')

        try:
            args = upload_parser.parse_args()
        except Exception as e:
            current_app.logger.error(f"Error parsing arguments: {str(e)}")
            current_app.logger.error(f"Exception type: {type(e)}")
            raise

        try:
            file = args['file']
            filename = secure_filename(file.filename)
            # A name without a dot has no extension; an unguarded rsplit()[1]
            # would raise IndexError and be reported as a 500.
            extension = filename.rsplit('.', 1)[1].lower() if '.' in filename else ''

            # validate_file_type(extension)

            api_input = {
                'catalog_id': args.get('catalog_id'),
                'name': args.get('name') or filename,
                'language': args.get('language'),
                'user_context': args.get('user_context'),
                'valid_from': args.get('valid_from'),
                'user_metadata': args.get('user_metadata'),
                'catalog_properties': args.get('catalog_properties'),
            }

            new_doc, new_doc_vers = create_document_stack(api_input, file, filename, extension, tenant_id)
            task_id = start_embedding_task(tenant_id, new_doc_vers.id)

            return {
                'message': f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
                'document_id': new_doc.id,
                'document_version_id': new_doc_vers.id,
                'task_id': task_id
            }, 201

        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
            current_app.logger.error(f'Error adding document: {str(e)}')
            document_ns.abort(400, str(e))
        except Exception as e:
            current_app.logger.error(f'Error adding document: {str(e)}')
            document_ns.abort(500, 'Error adding document')


# Models for AddDocumentThroughURL
add_document_through_url = document_ns.model('AddDocumentThroughURL', {
    'catalog_id': fields.Integer(required=True, description='ID of the catalog the URL needs to be added to'),
    'temp_url': fields.String(required=True, description='Temporary URL of the document to add'),
    'name': fields.String(required=True, description='Name of the document'),
    'language': fields.String(required=True, description='Language of the document'),
    'user_context': fields.String(required=False, description='User context for the document'),
    'valid_from': fields.String(required=False, description='Valid from date for the document'),
    'user_metadata': fields.String(required=False, description='User metadata for the document'),
    'system_metadata': fields.String(required=False, description='System metadata for the document'),
    'catalog_properties': fields.String(required=False,
                                        description='The catalog configuration to be passed along (JSON '
                                                    'format). Validity is against catalog requirements '
                                                    'is not checked, and is the responsibility of the '
                                                    'calling client.'),
})

add_document_through_url_response = document_ns.model('AddDocumentThroughURLResponse', {
    'message': fields.String(description='Status message'),
    'document_id': fields.Integer(description='ID of the created document'),
    'document_version_id': fields.Integer(description='ID of the created document version'),
    'task_id': fields.String(description='ID of the embedding task')
})


@document_ns.route('/add_document_through_url')
class AddDocumentThroughURL(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(add_document_through_url)
    @document_ns.response(201, 'Document added successfully', add_document_through_url_response)
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(422, 'File could not be processed')
    @document_ns.response(500, 'Internal Server Error')
    def post(self):
        """
        Add a new document to EveAI using a temporary URL.

        This endpoint is primarily used for integration with services that provide
        temporary URLs (like Zapier). The URL content is downloaded and processed
        as a new document.
        """
        tenant_id = get_jwt_identity()
        current_app.logger.info(f'Adding document through url for tenant {tenant_id}')

        try:
            args = document_ns.payload
        except Exception as e:
            current_app.logger.error(f"Error parsing arguments: {str(e)}")
            current_app.logger.error(f"Exception type: {type(e)}")
            raise

        # Reject malformed input up front instead of letting it escape as a 500.
        temp_url = args.get('temp_url')
        if not temp_url:
            return {'message': 'Missing required field: temp_url'}, 400
        try:
            user_metadata = _parse_user_metadata(args.get('user_metadata'))
        except (TypeError, ValueError) as e:
            current_app.logger.error(f'Invalid user_metadata: {str(e)}')
            return {'message': 'Invalid JSON format for user_metadata.'}, 400

        try:
            actual_file_content, actual_file_content_type = download_file_content(temp_url, user_metadata)

            # The provided name doubles as the filename of the stored file
            filename = secure_filename(args.get('name') or '')
            extension = filename.rsplit('.', 1)[1].lower() if '.' in filename else ''

            # Create FileStorage object from downloaded content
            file_content = io.BytesIO(actual_file_content)
            file = FileStorage(
                stream=file_content,
                filename=filename,
                content_type=actual_file_content_type
            )

            current_app.logger.info(f"Successfully downloaded file: {filename}")
        except requests.RequestException as e:
            current_app.logger.error(f"Error downloading file: {str(e)}")
            return {'message': f'Error downloading file: {str(e)}'}, 422

        try:
            # Prepare API input
            api_input = {
                'catalog_id': args.get('catalog_id'),
                'name': args.get('name') or filename,
                'language': args.get('language'),
                'user_context': args.get('user_context'),
                'valid_from': args.get('valid_from'),
                'user_metadata': args.get('user_metadata'),
                'catalog_properties': args.get('catalog_properties'),
            }

            new_doc, new_doc_vers = create_document_stack(api_input, file, filename, extension, tenant_id)
            task_id = start_embedding_task(tenant_id, new_doc_vers.id)

            return {
                'message': f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
                'document_id': new_doc.id,
                'document_version_id': new_doc_vers.id,
                'task_id': task_id
            }, 201

        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
            current_app.logger.error(f'Error adding document: {str(e)}')
            return {'message': str(e)}, 400
        except Exception as e:
            current_app.logger.error(f'Error adding document: {str(e)}')
            return {'message': 'Error adding document'}, 500


# Models for AddURL
add_url_model = document_ns.model('AddURL', {
    'catalog_id': fields.Integer(required=True, description='ID of the catalog the URL needs to be added to'),
    'url': fields.String(required=True, description='URL of the document to add'),
    'name': fields.String(required=False, description='Name of the document'),
    'language': fields.String(required=True, description='Language of the document'),
    'user_context': fields.String(required=False, description='User context for the document'),
    'valid_from': fields.String(required=False, description='Valid from date for the document'),
    'user_metadata': fields.String(required=False, description='User metadata for the document'),
    'system_metadata': fields.String(required=False, description='System metadata for the document'),
    'catalog_properties': fields.String(required=False,
                                        description='The catalog configuration to be passed along (JSON '
                                                    'format). Validity is against catalog requirements '
                                                    'is not checked, and is the responsibility of the '
                                                    'calling client.'),
})

add_url_response = document_ns.model('AddURLResponse', {
    'message': fields.String(description='Status message'),
    'document_id': fields.Integer(description='ID of the created document'),
    'document_version_id': fields.Integer(description='ID of the created document version'),
    'task_id': fields.String(description='ID of the embedding task')
})


@document_ns.route('/add_url')
class AddURL(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(add_url_model)
    @document_ns.response(201, 'Document added successfully', add_url_response)
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(500, 'Internal Server Error')
    def post(self):
        """
        Add a new document to EveAI from a permanent URL.

        This endpoint is used for URLs that will remain accessible. The URL is stored and can
        be used to refresh the document's content later.
        """
        tenant_id = get_jwt_identity()
        current_app.logger.info(f'Adding document from URL for tenant {tenant_id}')

        try:
            args = document_ns.payload
            cleaned_url = clean_url(args['url'])
            file_content, filename, extension = process_url(cleaned_url, tenant_id)

            api_input = {
                'catalog_id': args['catalog_id'],
                'url': cleaned_url,
                'name': args.get('name') or filename,
                'language': args['language'],
                'user_context': args.get('user_context'),
                'valid_from': args.get('valid_from'),
                'user_metadata': args.get('user_metadata'),
                'catalog_properties': args.get('catalog_properties'),
            }

            new_doc, new_doc_vers = create_document_stack(api_input, file_content, filename, extension, tenant_id)
            task_id = start_embedding_task(tenant_id, new_doc_vers.id)

            return {
                'message': f'Processing on document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task_id}.',
                'document_id': new_doc.id,
                'document_version_id': new_doc_vers.id,
                'task_id': task_id
            }, 201

        except EveAIDoubleURLException:
            document_ns.abort(400, f'A document with URL {args["url"]} already exists.')
        except (EveAIInvalidLanguageException, EveAIUnsupportedFileType) as e:
            document_ns.abort(400, str(e))
        except Exception as e:
            current_app.logger.error(f'Error adding document from URL: {str(e)}')
            document_ns.abort(500, 'Error adding document from URL')


document_list_model = document_ns.model('DocumentList', {
    'id': fields.Integer(description='Document ID'),
    'name': fields.String(description='Document name'),
    'valid_from': fields.DateTime(description='Valid from date'),
    'valid_to': fields.DateTime(description='Valid to date'),
})


@document_ns.route('/list')
class DocumentList(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.doc('list_documents')
    @document_ns.marshal_list_with(document_list_model, envelope='documents')
    def get(self):
        """List all documents"""
        # Paging comes from the query string: ?page=<n>&per_page=<n>
        page = request.args.get('page', 1, type=int)
        per_page = request.args.get('per_page', 10, type=int)
        pagination = get_documents_list(page, per_page)
        return pagination.items, 200


edit_document_model = document_ns.model('EditDocument', {
    'name': fields.String(required=True, description='New name for the document'),
    'valid_from': fields.DateTime(required=False, description='New valid from date'),
    'valid_to': fields.DateTime(required=False, description='New valid to date'),
})


# The handlers take ``document_id``, so the rule must declare it.
@document_ns.route('/<int:document_id>')
class DocumentResource(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.doc('edit_document')
    @document_ns.expect(edit_document_model)
    @document_ns.response(200, 'Document updated successfully')
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(404, 'Document not found')
    @document_ns.response(500, 'Internal Server Error')
    def put(self, document_id):
        """Edit a document. The content of the document will not be refreshed!"""
        try:
            data = request.json
            tenant_id = get_jwt_identity()
            updated_doc, error = edit_document(tenant_id, document_id, data.get('name', None),
                                               data.get('valid_from', None), data.get('valid_to', None))
            if updated_doc:
                return {'message': f'Document {updated_doc.id} updated successfully'}, 200
            else:
                return {'message': f'Error updating document: {error}'}, 400
        except EveAIException as e:
            return e.to_dict(), e.status_code

    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.doc('refresh_document')
    @document_ns.response(200, 'Document refreshed successfully')
    def post(self, document_id):
        """Refresh a document. In this case, the content of the document will be refreshed! This requires the document
        version to have a permanent and accessible URL!"""
        tenant_id = get_jwt_identity()
        new_version, result = refresh_document(document_id, tenant_id)
        if new_version:
            return {'message': f'Document refreshed. New version: {new_version.id}. Task ID: {result}'}, 200
        else:
            return {'message': f'Error refreshing document: {result}'}, 400


edit_document_version_model = document_ns.model('EditDocumentVersion', {
    'user_context': fields.String(required=True, description='New user context for the document version'),
    'catalog_properties': fields.String(required=True, description='New catalog properties for the document version'),
})


# The handler takes ``version_id``, so the rule must declare it.
@document_ns.route('/version/<int:version_id>')
class DocumentVersionResource(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.doc('edit_document_version')
    @document_ns.expect(edit_document_version_model)
    @document_ns.response(200, 'Document version updated successfully')
    def put(self, version_id):
        """Edit a document version"""
        data = request.json
        tenant_id = get_jwt_identity()
        updated_version, error = edit_document_version(tenant_id, version_id, data['user_context'],
                                                       data.get('catalog_properties'))
        if updated_version:
            return {'message': f'Document Version {updated_version.id} updated successfully'}, 200
        else:
            return {'message': f'Error updating document version: {error}'}, 400


# Define the model for the request body of refresh_with_info
refresh_document_model = document_ns.model('RefreshDocument', {
    'name': fields.String(required=False, description='New name for the document'),
    'language': fields.String(required=False, description='Language of the document'),
    'user_context': fields.String(required=False, description='User context for the document'),
    'user_metadata': fields.Raw(required=False, description='User metadata for the document'),
    'catalog_properties': fields.Raw(required=False, description='Catalog properties for the document'),
})


@document_ns.route('/<int:document_id>/refresh')
class RefreshDocument(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.response(200, 'Document refreshed successfully')
    @document_ns.response(404, 'Document not found')
    def post(self, document_id):
        """
        Refresh a document without additional information. In this case, the content of the document will be refreshed!
        This requires the document version to have a permanent and accessible URL!
        """
        tenant_id = get_jwt_identity()
        current_app.logger.info(f'Refreshing document {document_id} for tenant {tenant_id}')

        try:
            new_version, result = refresh_document(document_id, tenant_id)

            if new_version:
                return {
                    'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {result}',
                    'document_id': document_id,
                    'document_version_id': new_version.id,
                    'task_id': result
                }, 200
            else:
                return {'message': f'Error refreshing document: {result}'}, 400

        except Exception as e:
            current_app.logger.error(f'Error refreshing document: {str(e)}')
            return {'message': 'Internal server error'}, 500


@document_ns.route('/<int:document_id>/refresh_with_info')
class RefreshDocumentWithInfo(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(refresh_document_model)
    @document_ns.response(200, 'Document refreshed successfully')
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(404, 'Document not found')
    def post(self, document_id):
        """
        Refresh a document with new version information.
        """
        tenant_id = get_jwt_identity()
        current_app.logger.info(f'Refreshing document {document_id} with info for tenant {tenant_id}')

        try:
            api_input = request.json
            new_version, result = refresh_document_with_info(document_id, tenant_id, api_input)

            if new_version:
                return {
                    'message': f'Document refreshed successfully with new info. New version: {new_version.id}. Task ID: {result}',
                    'document_id': document_id,
                    'document_version_id': new_version.id,
                    'task_id': result
                }, 200
            else:
                return {'message': f'Error refreshing document with info: {result}'}, 400

        except Exception as e:
            current_app.logger.error(f'Error refreshing document with info: {str(e)}')
            return {'message': 'Internal server error'}, 500


# Define models for lookup requests
lookup_model = document_ns.model('DocumentLookup', {
    'lookup_criteria': fields.Raw(required=True,
                                  description='JSON object containing key-value pairs to match in metadata. '
                                              'Example: {"external_id": "123", "source": "zapier", "source_type": "google_docs"}'),
    'metadata_type': fields.String(required=True, enum=['user_metadata', 'system_metadata'],
                                   description='Which metadata field to search in')
})

lookup_response = document_ns.model('DocumentLookupResponse', {
    'document_id': fields.Integer(description='ID of the found document'),
    'document_version_id': fields.Integer(description='ID of the latest document version'),
    'name': fields.String(description='Document name'),
    'metadata': fields.Raw(description='Full metadata of the found document')
})


@document_ns.route('/lookup')
class DocumentLookup(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(lookup_model)
    @document_ns.marshal_with(lookup_response)
    @document_ns.response(200, 'Document found', lookup_response)
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(404, 'No document found matching criteria')
    def post(self):
        """
        Look up a document using metadata criteria
        """
        tenant_id = get_jwt_identity()

        # Errors are raised with abort() rather than returned: a returned
        # (dict, status) tuple is passed through marshal_with and re-shaped
        # into lookup_response, which would drop the error message.
        try:
            data = request.json
            lookup_criteria = data['lookup_criteria']
            metadata_type = data['metadata_type']
            # The value is used as an attribute name below, so restrict it to
            # the two fields the request model advertises.
            if metadata_type not in ('user_metadata', 'system_metadata'):
                document_ns.abort(400, "metadata_type must be 'user_metadata' or 'system_metadata'")

            document, version = lookup_document(tenant_id, lookup_criteria, metadata_type)

            return {
                'document_id': document.id,
                'document_version_id': version.id,
                'name': document.name,
                'metadata': getattr(version, metadata_type)
            }

        except EveAIException as e:
            document_ns.abort(e.status_code, str(e))

        except KeyError as e:
            document_ns.abort(400, f'Missing required field: {str(e)}')


refresh_url_model = document_ns.model('RefreshDocumentThroughURL', {
    'temp_url': fields.String(required=True, description='Temporary URL of the updated document content'),
    'language': fields.String(required=False, description='Language of the document'),
    'user_context': fields.String(required=False, description='User context for the document'),
    'user_metadata': fields.Raw(required=False, description='Custom metadata fields'),
    'catalog_properties': fields.Raw(required=False, description='Catalog-specific properties'),
})


@document_ns.route('/<int:document_id>/refresh_through_url')
class RefreshDocumentThroughURL(Resource):
    @jwt_required()
    @requires_service('DOCAPI')
    @document_ns.expect(refresh_url_model)
    @document_ns.response(200, 'Document refreshed successfully')
    @document_ns.response(400, 'Validation Error')
    @document_ns.response(404, 'Document not found')
    @document_ns.response(422, 'File could not be processed')
    def post(self, document_id):
        """Refresh a document using content from a URL"""
        tenant_id = get_jwt_identity()
        current_app.logger.info(f'Refreshing document {document_id} through URL for tenant {tenant_id}')

        try:
            # A document without any version cannot be refreshed.
            latest_version = (DocumentVersion.query.filter_by(doc_id=document_id)
                              .order_by(desc(DocumentVersion.id)).first())
            if latest_version is None:
                return {'message': f'No document version found for document {document_id}'}, 404

            args = request.json
            temp_url = args.get('temp_url')
            if not temp_url:
                return {'message': 'Missing required field: temp_url'}, 400
            try:
                # The model declares user_metadata as a raw JSON value, so it
                # may arrive as an object rather than as a JSON string.
                user_metadata = _parse_user_metadata(args.get('user_metadata'))
            except (TypeError, ValueError) as e:
                current_app.logger.error(f'Invalid user_metadata: {str(e)}')
                return {'message': 'Invalid JSON format for user_metadata.'}, 400

            try:
                actual_file_content, _ = download_file_content(temp_url, user_metadata)

                new_version, task_id = refresh_document_with_content(
                    document_id,
                    tenant_id,
                    actual_file_content,
                    args
                )

                return {
                    'message': f'Document refreshed successfully. New version: {new_version.id}. Task ID: {task_id}',
                    'document_id': document_id,
                    'document_version_id': new_version.id,
                    'task_id': task_id
                }, 200

            except requests.RequestException as e:
                current_app.logger.error(f"Error downloading file: {str(e)}")
                return {'message': f'Error downloading file: {str(e)}'}, 422

        except EveAIException as e:
            return e.to_dict(), e.status_code


def download_file_content(url: str, user_metadata: dict, timeout: float = 30) -> tuple[Any, Any]:
    """Download the file behind ``url`` and return ``(content, content_type)``.

    When ``user_metadata['service']`` contains ``'Zapier'`` the URL is treated
    as a stashed URL: its response body is itself the URL of the real file,
    which is then fetched in a second request.

    Args:
        url: URL to download, or the stashed URL for Zapier.
        user_metadata: Decoded user metadata; only its ``service`` entry is read.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The raw body of the file response and its ``content-type`` header,
        defaulting to ``application/octet-stream`` when the header is absent.

    Raises:
        requests.RequestException: on connection errors, timeouts or non-2xx responses.
    """
    # NOTE(review): both URLs originate from the caller (the second one from a
    # remote response) and are fetched server-side without host restrictions.
    # Consider an allow-list if this endpoint is reachable by untrusted clients.
    service = user_metadata.get('service') if isinstance(user_metadata, dict) else None
    uses_stashed_url = isinstance(service, (str, list, tuple)) and 'Zapier' in service

    if uses_stashed_url:
        # Zapier uses a system of Stashed URLs
        # Step 1: Download from stashed URL
        stashed_url = url
        current_app.logger.info(f"Downloading stashed file from URL: {stashed_url}")
        with requests.get(stashed_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            hydration_url = response.text.strip()
        current_app.logger.info(f"Downloading actual file from URL: {hydration_url}")
        # Step 2: Download from hydration URL
        actual_url = hydration_url
    else:
        actual_url = url

    # The context manager releases the connection even when the status check fails.
    with requests.get(actual_url, stream=True, timeout=timeout) as actual_file_response:
        actual_file_response.raise_for_status()
        actual_file_content = actual_file_response.content
        actual_file_content_type = actual_file_response.headers.get('content-type', 'application/octet-stream')

    return actual_file_content, actual_file_content_type