import io
import re

import docx

from common.extensions import minio_client

from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry

# Trailing digits of a (lower-cased) heading style name, e.g. "heading 2".
_HEADING_LEVEL_RE = re.compile(r"(\d+)$")

# Markdown (ATX) headings only exist for levels 1 through 6.
_MAX_MARKDOWN_HEADING_LEVEL = 6


class DocxProcessor(BaseProcessor):
    """Convert a DOCX document version into Markdown.

    Behaviour is driven by ``processor.configuration``:

    * ``extract_comments`` (default ``False``) - append document comments.
    * ``extract_headers_footers`` (default ``False``) - include section
      headers before the body and section footers after it.
    * ``preserve_formatting`` (default ``True``) - keep bold / italic /
      underline run formatting as inline Markdown markers.
    * ``list_style`` (default ``'dash'``) - bullet marker for list items
      (``'dash'``, ``'asterisk'`` or ``'plus'``).
    * ``image_handling`` / ``table_alignment`` - read and stored, but not
      used by any method of this class.
    """

    def __init__(self, tenant, document_version, catalog, processor):
        super().__init__(tenant, document_version, catalog, processor)
        self.config = processor.configuration
        self.extract_comments = self.config.get('extract_comments', False)
        self.extract_headers_footers = self.config.get('extract_headers_footers', False)
        self.preserve_formatting = self.config.get('preserve_formatting', True)
        self.list_style = self.config.get('list_style', 'dash')
        self.image_handling = self.config.get('image_handling', 'skip')
        self.table_alignment = self.config.get('table_alignment', 'left')

    def process(self):
        """Download the DOCX file, convert it and save the Markdown.

        Returns:
            A ``(markdown, title)`` tuple.

        Raises:
            Exception: any failure is logged at error level and re-raised
                unchanged.
        """
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            doc = docx.Document(io.BytesIO(file_data))
            markdown = self._convert_to_markdown(doc)
            title = self._extract_title(doc)

            self._save_markdown(markdown)
            return markdown, title
        except Exception as e:
            self._log(f"Error processing DOCX: {e}", level='error')
            raise

    def _convert_to_markdown(self, doc):
        """Assemble the Markdown text for ``doc``.

        Order of output: headers (optional), body paragraphs, footers
        (optional), comments (optional). Parts are joined with newlines.
        """
        markdown_parts = []

        if self.extract_headers_footers:
            markdown_parts.extend(self._process_header_footer(doc, 'header'))

        markdown_parts.extend(self._process_paragraphs(doc.paragraphs))

        if self.extract_headers_footers:
            markdown_parts.extend(self._process_header_footer(doc, 'footer'))

        if self.extract_comments:
            # ``Document.comments`` is not available on every python-docx
            # release; treat a missing attribute as "no comments".
            comments = getattr(doc, 'comments', None)
            if comments:
                markdown_parts.append("\n## Comments\n")
                for comment in comments:
                    markdown_parts.append(f"> {comment.text}\n")

        return "\n".join(markdown_parts)

    def _process_header_footer(self, doc, attr):
        """Return Markdown parts for every section's ``attr`` part.

        ``attr`` is ``'header'`` or ``'footer'``. Parts that are linked to
        the previous section are skipped so inherited content is emitted
        only once.
        """
        parts = []
        for section in doc.sections:
            part = getattr(section, attr)
            if part.is_linked_to_previous:
                continue
            parts.extend(self._process_paragraphs(part.paragraphs))
        return parts

    def _process_paragraphs(self, paragraphs):
        """Convert paragraphs into a list of Markdown fragments.

        Blank paragraphs are skipped. Heading styles become ATX headings,
        numbered/bulleted paragraphs become list items, everything else is
        emitted as plain (optionally formatted) text.
        """
        markdown_parts = []
        in_list = False

        for para in paragraphs:
            if not para.text.strip():
                continue

            style = (para.style.name or '').lower()
            is_heading = 'heading' in style
            # Headings win over list numbering (numbered headings).
            is_list_item = not is_heading and self._is_list_item(para)

            # Close a list with a blank line as soon as it ends, whatever
            # kind of paragraph follows it.
            if in_list and not is_list_item:
                markdown_parts.append("\n")
            in_list = is_list_item

            if is_heading:
                level = self._heading_level(style)
                markdown_parts.append(f"{'#' * level} {para.text}\n")
            elif is_list_item:
                marker = self._get_list_marker()
                markdown_parts.append(f"{marker} {para.text}\n")
            else:
                text = para.text
                if self.preserve_formatting:
                    # Fall back to the plain text when the runs yield
                    # nothing, so the paragraph is never silently dropped.
                    formatted = self._apply_formatting(para)
                    if formatted.strip():
                        text = formatted
                markdown_parts.append(f"{text}\n")

        return markdown_parts

    @staticmethod
    def _is_list_item(paragraph):
        """Return True when the paragraph carries numbering properties."""
        p_pr = paragraph._p.pPr
        # Explicit ``is not None``: truth-testing an XML element depends on
        # whether it has children, not on whether it exists.
        return p_pr is not None and p_pr.numPr is not None

    @staticmethod
    def _heading_level(style_name):
        """Return the Markdown heading level (1-6) for a heading style name.

        The level is taken from the trailing digits of ``style_name``;
        names without trailing digits map to level 1.
        """
        match = _HEADING_LEVEL_RE.search(style_name)
        if match is None:
            return 1
        level = int(match.group(1))
        return max(1, min(level, _MAX_MARKDOWN_HEADING_LEVEL))

    def _get_list_marker(self):
        """Return the bullet character for ``self.list_style`` ('-' if unknown)."""
        return {
            'dash': '-',
            'asterisk': '*',
            'plus': '+'
        }.get(self.list_style, '-')

    def _apply_formatting(self, paragraph):
        """Return the paragraph's run text with inline Markdown markers.

        Bold runs are wrapped in ``**``, italic in ``*`` and underlined in
        ``__`` (applied in that order, innermost first). Leading/trailing
        whitespace of a run is kept outside the markers, and whitespace-only
        runs are emitted untouched, so markers always hug visible text.
        """
        if not paragraph.text:
            return ""

        formatted_parts = []
        for run in paragraph.runs:
            raw = run.text
            core = raw.strip()
            if not core:
                # Nothing visible to emphasise; keep spacing as-is.
                formatted_parts.append(raw)
                continue

            leading = raw[:len(raw) - len(raw.lstrip())]
            trailing = raw[len(raw.rstrip()):]

            if run.bold:
                core = f"**{core}**"
            if run.italic:
                core = f"*{core}*"
            if run.underline:
                # NOTE(review): standard Markdown renders ``__x__`` as strong
                # emphasis, not underline; marker kept for output
                # compatibility.
                core = f"__{core}__"

            formatted_parts.append(f"{leading}{core}{trailing}")

        return "".join(formatted_parts)

    def _extract_title(self, doc):
        """Return a title for ``doc``.

        Preference order: the first paragraph when it uses a heading style,
        then the first non-blank "Heading 1" paragraph, then the literal
        ``"Untitled Document"``.
        """
        paragraphs = doc.paragraphs  # read once; reused below

        if paragraphs:
            first_para = paragraphs[0]
            if 'heading' in (first_para.style.name or '').lower():
                title = first_para.text.strip()
                if title:
                    return title

        # Look for first Heading 1 in document
        for para in paragraphs:
            if (para.style.name or '').lower() == 'heading 1':
                title = para.text.strip()
                if title:
                    return title

        return "Untitled Document"


ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)