129 lines
4.1 KiB
Python
129 lines
4.1 KiB
Python
import docx
|
|
import io
|
|
from .base_processor import BaseProcessor
|
|
from .processor_registry import ProcessorRegistry
|
|
from common.extensions import minio_client
|
|
import re
|
|
|
|
|
|
class DocxProcessor(BaseProcessor):
    """Convert a stored DOCX document version into Markdown.

    The processor downloads the DOCX file through ``minio_client``, parses it
    with python-docx and renders headings, list items and plain paragraphs
    (optionally with inline bold/italic/underline markers) as Markdown.

    Configuration keys read from ``processor.configuration``:

    * ``extract_comments`` (default ``False``): append a "Comments" section.
    * ``extract_headers_footers`` (default ``False``): include section
      headers before the body and section footers after it.
    * ``preserve_formatting`` (default ``True``): emit inline emphasis for
      plain paragraphs.
    * ``list_style`` (default ``'dash'``): one of ``dash``, ``asterisk``,
      ``plus``.
    * ``image_handling`` (default ``'skip'``) and ``table_alignment``
      (default ``'left'``): stored on the instance but not read anywhere in
      this class.
    """

    # Markdown (ATX headings) only defines levels 1 through 6.
    _MAX_HEADING_LEVEL = 6

    # Supported ``list_style`` values and the bullet each one produces.
    _LIST_MARKERS = {
        'dash': '-',
        'asterisk': '*',
        'plus': '+',
    }

    # Built-in Word list styles carry their numbering in the style itself, so
    # paragraphs using them have no direct ``numPr`` element.
    _LIST_STYLE_PREFIXES = ('list bullet', 'list number')

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        """Store the processor configuration and its per-option settings.

        All arguments are forwarded unchanged to ``BaseProcessor.__init__``.
        """
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        # Tolerate a processor without configuration instead of failing on
        # ``None.get`` below.
        self.config = processor.configuration or {}
        self.extract_comments = self.config.get('extract_comments', False)
        self.extract_headers_footers = self.config.get('extract_headers_footers', False)
        self.preserve_formatting = self.config.get('preserve_formatting', True)
        self.list_style = self.config.get('list_style', 'dash')
        self.image_handling = self.config.get('image_handling', 'skip')
        self.table_alignment = self.config.get('table_alignment', 'left')

    def process(self):
        """Download the DOCX file, convert it and persist the Markdown.

        Returns:
            A ``(markdown, title)`` tuple.

        Raises:
            Exception: any error raised while downloading, parsing or saving
                is logged through ``self._log`` and re-raised unchanged.
        """
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )

            doc = docx.Document(io.BytesIO(file_data))
            markdown = self._convert_to_markdown(doc)
            title = self._extract_title(doc)

            # ``_save_markdown`` is not defined in this class; presumably
            # provided by BaseProcessor.
            self._save_markdown(markdown)
            return markdown, title

        except Exception as e:
            self._log(f"Error processing DOCX: {e}", level='error')
            raise

    def _convert_to_markdown(self, doc):
        """Render the document (and optional extras) as one Markdown string.

        Order of the output: section headers (if enabled), body paragraphs,
        section footers (if enabled), then a "Comments" section (if enabled
        and the document exposes comments).
        """
        markdown_parts = []

        if self.extract_headers_footers:
            markdown_parts.extend(self._process_section_parts(doc, 'header'))

        markdown_parts.extend(self._process_paragraphs(doc.paragraphs))

        if self.extract_headers_footers:
            markdown_parts.extend(self._process_section_parts(doc, 'footer'))

        if self.extract_comments:
            # ``Document.comments`` does not exist on every python-docx
            # release; treat its absence as "no comments".
            comments = getattr(doc, 'comments', None)
            comment_texts = [c.text for c in comments] if comments is not None else []
            if comment_texts:
                markdown_parts.append("\n## Comments\n")
                for comment_text in comment_texts:
                    # Prefix every line so a multi-line comment stays inside
                    # the blockquote.
                    quoted = "\n".join(
                        f"> {line}" for line in comment_text.split("\n")
                    )
                    markdown_parts.append(f"{quoted}\n")

        return "\n".join(markdown_parts)

    def _process_section_parts(self, doc, attr_name):
        """Return Markdown parts for each section's own header or footer.

        Args:
            doc: the python-docx document.
            attr_name: ``'header'`` or ``'footer'``.
        """
        parts = []
        for section in doc.sections:
            part = getattr(section, attr_name)
            # A linked header/footer only repeats the previous section's
            # content, and reading its paragraphs may make python-docx create
            # an empty definition in the document. Skip it.
            if part.is_linked_to_previous:
                continue
            parts.extend(self._process_paragraphs(part.paragraphs))
        return parts

    def _process_paragraphs(self, paragraphs):
        """Convert paragraphs into a list of Markdown fragments.

        Blank paragraphs are skipped. Heading styles take priority over list
        detection, so a numbered heading is rendered as a heading. A lone
        ``"\\n"`` fragment is emitted when a run of list items ends.
        """
        markdown_parts = []
        in_list = False
        marker = self._get_list_marker()

        for para in paragraphs:
            if not para.text.strip():
                continue

            style_name = self._style_name(para)
            level = self._heading_level(style_name)
            is_list_item = level is None and self._is_list_item(para, style_name)

            # Close the list as soon as any non-list paragraph follows it,
            # headings included.
            if in_list and not is_list_item:
                markdown_parts.append("\n")
                in_list = False

            if level is not None:
                markdown_parts.append(f"{'#' * level} {para.text}\n")
            elif is_list_item:
                markdown_parts.append(f"{marker} {para.text}\n")
                in_list = True
            else:
                text = para.text
                if self.preserve_formatting:
                    text = self._apply_formatting(para)
                markdown_parts.append(f"{text}\n")

        return markdown_parts

    @staticmethod
    def _style_name(para):
        """Return the paragraph's lower-cased style name, or ``''`` if none."""
        name = getattr(para.style, 'name', None)
        return name.lower() if name else ''

    def _heading_level(self, style_name):
        """Return the Markdown heading level for a style name, else ``None``.

        Any style whose name contains ``heading`` counts as a heading. The
        level is the trailing digit (default 1), clamped to 1..6.
        """
        if 'heading' not in style_name:
            return None
        last_char = style_name[-1]
        level = int(last_char) if last_char.isdecimal() else 1
        return max(1, min(level, self._MAX_HEADING_LEVEL))

    def _is_list_item(self, para, style_name):
        """Tell whether the paragraph is a bulleted or numbered list item."""
        # Compare against None explicitly: truth-testing an lxml element
        # checks whether it has children, not whether it exists.
        pPr = para._p.pPr
        if pPr is not None and pPr.numPr is not None:
            return True
        return style_name.startswith(self._LIST_STYLE_PREFIXES)

    def _get_list_marker(self):
        """Return the bullet for ``self.list_style`` (``'-'`` if unknown)."""
        return self._LIST_MARKERS.get(self.list_style, '-')

    def _apply_formatting(self, paragraph):
        """Return the paragraph text with inline emphasis markers applied.

        Runs are formatted individually. Non-run inline content (hyperlinks)
        is emitted as plain text so it is not lost.
        """
        text = paragraph.text
        if not text:
            return ""

        # ``iter_inner_content`` yields runs and hyperlinks in document
        # order; fall back to ``runs`` where it is not available.
        iter_inner = getattr(paragraph, 'iter_inner_content', None)
        items = iter_inner() if callable(iter_inner) else paragraph.runs

        formatted_parts = []
        for item in items:
            if hasattr(item, 'runs'):
                # Hyperlink-like container: keep its text, unformatted.
                formatted_parts.append(item.text)
            else:
                formatted_parts.append(self._format_run(item))

        formatted = "".join(formatted_parts)
        # If the inline items produced nothing visible, keep the plain text
        # rather than returning an empty paragraph.
        return formatted if formatted.strip() else text

    @staticmethod
    def _format_run(run):
        """Wrap a run's text in Markdown markers for bold/italic/underline.

        Surrounding whitespace is kept outside the markers, because Markdown
        does not recognise emphasis that starts or ends with a space.
        """
        raw = run.text
        core = raw.strip()
        if not core:
            # Empty or whitespace-only run: there is nothing to emphasise.
            return raw

        if run.bold:
            core = f"**{core}**"
        if run.italic:
            core = f"*{core}*"
        if run.underline:
            # NOTE(review): ``__text__`` renders as bold in most Markdown
            # flavours; kept unchanged to preserve the existing output format.
            core = f"__{core}__"

        leading = raw[:len(raw) - len(raw.lstrip())]
        trailing = raw[len(raw.rstrip()):]
        return f"{leading}{core}{trailing}"

    def _extract_title(self, doc):
        """Return the document title derived from its headings.

        Uses the first paragraph when it has a heading style and non-blank
        text, otherwise the first non-blank "Heading 1" paragraph, otherwise
        ``"Untitled Document"``.
        """
        # ``doc.paragraphs`` is read once and reused for both lookups.
        paragraphs = doc.paragraphs

        if paragraphs:
            first_para = paragraphs[0]
            if 'heading' in self._style_name(first_para):
                title = first_para.text.strip()
                if title:
                    return title

        # Look for first Heading 1 in document
        for para in paragraphs:
            if self._style_name(para) == 'heading 1':
                title = para.text.strip()
                if title:
                    return title

        return "Untitled Document"
|
|
|
|
|
|
# Register this processor class under the "DOCX_PROCESSOR" key at import time.
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)