Files
eveAI/eveai_workers/processors/markdown_processor.py

49 lines
1.7 KiB
Python

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
def _find_first_h1(markdown: str) -> str:
# Look for # Header (allowing spaces after #)
match = re.search(r'^#\s+(.+)$', markdown, re.MULTILINE)
return match.group(1).strip() if match else ""
class MarkdownProcessor(BaseProcessor):
    """Processor that loads a Markdown document and extracts its title.

    The document is fetched through ``minio_client``, decoded as UTF-8,
    handed to ``self._save_markdown`` and returned together with the text of
    its first level-1 heading.
    """

    def __init__(self, tenant, document_version, catalog, processor):
        """Initialise the processor and its chunking settings.

        Args:
            tenant: Tenant object; its ``id`` is used when downloading.
            document_version: Object exposing ``bucket_name`` and
                ``object_name`` of the file to process.
            catalog: Object exposing ``max_chunk_size``.
            processor: Processor configuration, forwarded to the base class.
        """
        super().__init__(tenant, document_version, catalog, processor)
        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        # NOTE(review): assumes the base initialiser stores ``processor`` on
        # ``self.processor`` -- confirm against BaseProcessor.
        self.tuning = self.processor.tuning

    def process(self):
        """Download the Markdown file, save it and return it with its title.

        Returns:
            A ``(markdown, title)`` tuple, where ``title`` is the text of the
            first level-1 heading or ``""`` when there is none.

        Raises:
            Exception: Any error raised while downloading, decoding or saving
                is logged at error level and re-raised unchanged.
        """
        self._log("Starting Markdown processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM.
            # With plain 'utf-8' the BOM stays in the text, so a heading on
            # the first line no longer starts with '#' and is not detected.
            markdown = file_data.decode('utf-8-sig')
            title = _find_first_h1(markdown)

            self._save_markdown(markdown)
            self._log("Finished processing Markdown")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing Markdown: {e}", level='error')
            raise
# Register this class with the processor registry under the
# "MARKDOWN_PROCESSOR" key at import time.
ProcessorRegistry.register(
    "MARKDOWN_PROCESSOR",
    MarkdownProcessor,
)