- Addition of general chunking parameters chunking_heading_level and chunking patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -0,0 +1,129 @@
import docx
import io
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry
from common.extensions import minio_client
import re
class DocxProcessor(BaseProcessor):
    """Convert a DOCX document version into Markdown.

    The processor downloads the DOCX file through ``minio_client``, parses it
    with python-docx, renders its paragraphs as Markdown and derives a title.

    Configuration keys read from ``processor.configuration``:

    * ``extract_comments`` (default ``False``): append document comments.
    * ``extract_headers_footers`` (default ``False``): include section
      headers before the body and section footers after it.
    * ``preserve_formatting`` (default ``True``): render bold / italic /
      underline runs of normal paragraphs with Markdown emphasis markers.
    * ``list_style`` (default ``'dash'``): ``'dash'``, ``'asterisk'`` or
      ``'plus'``; selects the list item marker.
    * ``image_handling`` / ``table_alignment``: stored on the instance but
      not used by any method of this class.

    Only paragraphs are converted; table and image content of the document
    body is not rendered by this class.
    """

    # Markdown ATX headings support at most six '#' characters.
    MAX_HEADING_LEVEL = 6
    DEFAULT_TITLE = "Untitled Document"
    LIST_MARKERS = {
        'dash': '-',
        'asterisk': '*',
        'plus': '+',
    }

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        """Store the processor configuration and derive the option attributes."""
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        # Fall back to an empty mapping so the .get() lookups below still work
        # when the processor carries no configuration at all.
        self.config = processor.configuration or {}
        self.extract_comments = self.config.get('extract_comments', False)
        self.extract_headers_footers = self.config.get('extract_headers_footers', False)
        self.preserve_formatting = self.config.get('preserve_formatting', True)
        self.list_style = self.config.get('list_style', 'dash')
        self.image_handling = self.config.get('image_handling', 'skip')
        self.table_alignment = self.config.get('table_alignment', 'left')

    def process(self):
        """Download the DOCX file, convert it and save the resulting Markdown.

        Returns:
            A ``(markdown, title)`` tuple.

        Raises:
            Any exception raised while downloading, parsing, converting or
            saving; it is logged at error level and then re-raised unchanged.
        """
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            doc = docx.Document(io.BytesIO(file_data))
            markdown = self._convert_to_markdown(doc)
            title = self._extract_title(doc)
            # _save_markdown / _log are not defined in this class
            # (presumably inherited from BaseProcessor).
            self._save_markdown(markdown)
            return markdown, title
        except Exception as e:
            self._log(f"Error processing DOCX: {e}", level='error')
            raise

    def _convert_to_markdown(self, doc):
        """Render the whole document as one Markdown string.

        Order of the output: section headers (optional), body paragraphs,
        section footers (optional), comments (optional).
        """
        markdown_parts = []
        footer_parts = []

        if self.extract_headers_footers:
            for section in doc.sections:
                markdown_parts.extend(self._process_header_footer(section.header))
                footer_parts.extend(self._process_header_footer(section.footer))

        markdown_parts.extend(self._process_paragraphs(doc.paragraphs))
        markdown_parts.extend(footer_parts)

        if self.extract_comments:
            # Document.comments is missing in older python-docx releases;
            # treat its absence as "no comments" instead of raising.
            comments = list(getattr(doc, 'comments', None) or [])
            if comments:
                markdown_parts.append("\n## Comments\n")
                markdown_parts.extend(f"> {comment.text}\n" for comment in comments)

        return "\n".join(markdown_parts)

    def _process_header_footer(self, part):
        """Return the Markdown parts of one section header or footer.

        A header/footer linked to the previous section only repeats that
        section's content (or has no content for the first section), so it is
        skipped to avoid duplicated text.
        """
        if getattr(part, 'is_linked_to_previous', False):
            return []
        return self._process_paragraphs(part.paragraphs)

    def _process_paragraphs(self, paragraphs):
        """Convert paragraphs into a list of Markdown fragments.

        Blank paragraphs are skipped. Headings become ATX headings, list
        items get the configured marker, and every other paragraph is emitted
        as text (with emphasis markers when ``preserve_formatting`` is set).
        """
        markdown_parts = []
        in_list = False

        for para in paragraphs:
            if not para.text.strip():
                continue

            style = self._style_name(para)

            if 'heading' in style:
                level = self._heading_level(style)
                markdown_parts.append(f"{'#' * level} {para.text}\n")
            elif self._is_list_item(para, style):
                marker = self._get_list_marker()
                markdown_parts.append(f"{marker} {para.text}\n")
                in_list = True
            else:
                if in_list:
                    # Separate the finished list from the following paragraph.
                    markdown_parts.append("\n")
                    in_list = False

                text = para.text
                if self.preserve_formatting:
                    text = self._apply_formatting(para)
                markdown_parts.append(f"{text}\n")

        return markdown_parts

    @staticmethod
    def _style_name(para):
        """Return the paragraph's style name in lower case ('' when unavailable)."""
        name = getattr(para.style, 'name', None)
        return (name or "").lower()

    def _heading_level(self, style_name):
        """Derive the Markdown heading level from a style name such as 'heading 2'.

        Falls back to level 1 when the name does not end in an ASCII digit and
        clamps the result to 1..MAX_HEADING_LEVEL.
        """
        last_char = style_name[-1:]
        # isascii() guards against characters like superscript digits, for
        # which str.isdigit() is true but int() raises ValueError.
        if last_char.isascii() and last_char.isdigit():
            level = int(last_char)
        else:
            level = 1
        return max(1, min(level, self.MAX_HEADING_LEVEL))

    @staticmethod
    def _is_list_item(para, style_name):
        """Tell whether the paragraph is a list item.

        A paragraph counts as a list item when it carries direct numbering
        properties (``numPr``) or uses a 'List Bullet' / 'List Number' style.
        """
        pPr = para._p.pPr
        # Compare with None explicitly: the truth value of an XML element
        # reflects whether it has children, not whether it exists.
        if pPr is not None and pPr.numPr is not None:
            return True
        return style_name.startswith(('list bullet', 'list number'))

    def _get_list_marker(self):
        """Return the list marker for ``list_style`` ('-' for unknown styles)."""
        return self.LIST_MARKERS.get(self.list_style, '-')

    def _apply_formatting(self, paragraph):
        """Return the paragraph text with Markdown emphasis markers applied.

        Consecutive runs with the same bold / italic / underline state are
        merged first, so text that the editor split into several runs does
        not produce back-to-back markers such as ``**a****b**``.
        """
        text = paragraph.text
        if not text:
            return ""

        # Each segment is [(bold, italic, underline), text].
        segments = []
        for run in self._iter_runs(paragraph):
            flags = (bool(run.bold), bool(run.italic), bool(run.underline))
            if segments and segments[-1][0] == flags:
                segments[-1][1] += run.text
            else:
                segments.append([flags, run.text])

        formatted = "".join(
            self._format_segment(flags, segment_text)
            for flags, segment_text in segments
        )
        # If the runs carried no visible text, keep the plain paragraph text
        # rather than emitting an empty line.
        return formatted if formatted.strip() else text

    @staticmethod
    def _iter_runs(paragraph):
        """Yield the paragraph's runs in order, including runs inside hyperlinks.

        ``paragraph.runs`` skips hyperlink content; when the paragraph offers
        ``iter_inner_content()`` it is used so that link text is not lost.
        """
        iter_inner = getattr(paragraph, 'iter_inner_content', None)
        if iter_inner is None:
            yield from paragraph.runs
            return

        for item in iter_inner():
            nested_runs = getattr(item, 'runs', None)
            if nested_runs is None:
                yield item  # a plain run
            else:
                yield from nested_runs  # a hyperlink wrapping runs

    @staticmethod
    def _format_segment(flags, text):
        """Wrap the non-blank core of ``text`` in the markers selected by ``flags``."""
        bold, italic, underline = flags
        core = text.strip()
        if not core or not (bold or italic or underline):
            # Nothing to emphasise; markers around blank text would show up
            # literally (e.g. '****').
            return text

        # Keep surrounding whitespace outside the markers: '**bold **' is not
        # recognised as emphasis, '**bold** ' is.
        leading = text[:len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()):]

        if bold:
            core = f"**{core}**"
        if italic:
            core = f"*{core}*"
        if underline:
            # NOTE(review): '__text__' renders as strong emphasis in Markdown,
            # not as underline; marker kept as-is for output compatibility.
            core = f"__{core}__"
        return f"{leading}{core}{trailing}"

    def _extract_title(self, doc):
        """Pick a document title.

        Preference order: the first paragraph when it is a non-empty heading,
        then the first non-empty 'Heading 1' paragraph, then
        ``"Untitled Document"``.
        """
        # doc.paragraphs is read once and reused for both lookups.
        paragraphs = doc.paragraphs

        if paragraphs:
            first_para = paragraphs[0]
            first_text = first_para.text.strip()
            if first_text and 'heading' in self._style_name(first_para):
                return first_text

        # Look for first Heading 1 in document
        for para in paragraphs:
            if self._style_name(para) == 'heading 1':
                heading_text = para.text.strip()
                if heading_text:
                    return heading_text

        return self.DEFAULT_TITLE
# Register DocxProcessor with the processor registry under the
# "DOCX_PROCESSOR" key; this runs as a side effect of importing the module.
ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)