"""Markdown document processor: loads a Markdown file and extracts its title."""
import re

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry

# A level-1 ATX heading: a single '#', at least one whitespace character that
# is NOT a line break, then heading text that starts with a non-blank char.
# ``[^\S\r\n]`` is used instead of ``\s`` because ``\s`` also matches newlines,
# which would let an empty "#" line swallow the following line as its title.
_H1_PATTERN = re.compile(r'^#[^\S\r\n]+(\S.*)$', re.MULTILINE)


def _find_first_h1(markdown: str) -> str:
    """Return the text of the first level-1 heading in *markdown*.

    Only ATX-style headings (``# Title``) at the very start of a line are
    recognised. Headings whose text is empty or blank are skipped so that a
    later, real heading can still be found.

    Args:
        markdown: The Markdown document as text.

    Returns:
        The heading text with surrounding whitespace removed, or ``""`` when
        the document has no non-empty level-1 heading.
    """
    # A UTF-8 byte-order mark survives ``bytes.decode('utf-8')`` and would
    # otherwise prevent '^#' from matching a heading on the first line.
    if markdown.startswith('\ufeff'):
        markdown = markdown[1:]

    # NOTE(review): lines inside fenced code blocks are not excluded, so a
    # "# comment" line in a code sample that precedes the real heading is
    # still treated as the title.
    match = _H1_PATTERN.search(markdown)
    return match.group(1).strip() if match else ""


class MarkdownProcessor(BaseProcessor):
    """Processor for Markdown documents stored in object storage.

    Downloads the document file, decodes it as UTF-8, derives a title from the
    first level-1 heading and hands the Markdown to ``_save_markdown``.
    """

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        """Initialise the processor and read its chunking settings.

        All arguments are forwarded unchanged to ``BaseProcessor.__init__``.
        ``catalog.max_chunk_size`` is used as the chunk size and the chunk
        overlap is fixed at 0.
        """
        super().__init__(tenant, model_variables, document_version, catalog, processor)

        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        # self.processor is presumably set by BaseProcessor.__init__ - verify.
        self.tuning = self.processor.tuning

    def process(self):
        """Download, decode and store the Markdown document.

        Returns:
            A ``(markdown, title)`` tuple: the decoded document text and the
            text of its first level-1 heading (``""`` if there is none).

        Raises:
            Exception: Any error raised while downloading, decoding or saving
                is logged at error level and then re-raised unchanged.
        """
        self._log("Starting Markdown processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            markdown = file_data.decode('utf-8')
            title = _find_first_h1(markdown)

            self._save_markdown(markdown)
            self._log("Finished processing Markdown")
            return markdown, title
        except Exception as e:
            # Log for diagnostics, then let the caller decide how to handle it.
            self._log(f"Error processing Markdown: {e}", level='error')
            raise


# Make the processor discoverable under its registry key at import time.
ProcessorRegistry.register("MARKDOWN_PROCESSOR", MarkdownProcessor)