- Addition of general chunking parameters `chunking_heading_level` and `chunking_patterns`

- Addition of processor types DOCX and Markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -46,7 +46,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
try:
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
total_duration = len(audio_info)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000 # Convert to milliseconds
@@ -55,7 +55,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
@@ -87,7 +87,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
@@ -172,14 +172,14 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
self.log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
self.log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -202,7 +202,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
self.log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription

View File

@@ -17,7 +17,7 @@ class BaseProcessor(ABC):
self.tuning_logger = None
self._setup_tuning_logger()
self._log_tuning("Processor initialized", {
self.log_tuning("Processor initialized", {
"processor_type": processor.type if processor else None,
"document_version": document_version.id if document_version else None,
"catalog": catalog.id if catalog else None
@@ -42,6 +42,10 @@ class BaseProcessor(ABC):
def process(self):
pass
@property
def configuration(self):
    """Convenience accessor for the underlying processor's configuration."""
    return self.processor.configuration
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
@@ -78,7 +82,7 @@ class BaseProcessor(ABC):
return markdown
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
def log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
if self.tuning and self.tuning_logger:
try:
self.tuning_logger.log_tuning('processor', message, data)

View File

@@ -0,0 +1,129 @@
import docx
import io
from .base_processor import BaseProcessor
from .processor_registry import ProcessorRegistry
from common.extensions import minio_client
import re
class DocxProcessor(BaseProcessor):
    """Converts a DOCX document (downloaded from MinIO) into Markdown.

    Configuration keys (read from ``processor.configuration``, all optional):
      extract_comments        -- append a "## Comments" section (default False)
      extract_headers_footers -- include section headers (default False)
      preserve_formatting     -- keep bold/italic/underline runs (default True)
      list_style              -- bullet marker: 'dash' | 'asterisk' | 'plus'
      image_handling          -- read but not applied anywhere in this class
      table_alignment         -- read but not applied anywhere in this class
    """

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.config = processor.configuration
        self.extract_comments = self.config.get('extract_comments', False)
        self.extract_headers_footers = self.config.get('extract_headers_footers', False)
        self.preserve_formatting = self.config.get('preserve_formatting', True)
        self.list_style = self.config.get('list_style', 'dash')
        # NOTE(review): image_handling / table_alignment are stored but never
        # used below -- presumably reserved for future features; confirm.
        self.image_handling = self.config.get('image_handling', 'skip')
        self.table_alignment = self.config.get('table_alignment', 'left')

    def process(self):
        """Download the DOCX, convert it, persist the markdown, and return (markdown, title).

        Raises: re-raises any download/parse error after logging it.
        """
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            doc = docx.Document(io.BytesIO(file_data))
            markdown = self._convert_to_markdown(doc)
            title = self._extract_title(doc)
            self._save_markdown(markdown)
            return markdown, title
        except Exception as e:
            self._log(f"Error processing DOCX: {str(e)}", level='error')
            raise

    def _convert_to_markdown(self, doc):
        """Assemble Markdown from optional section headers, the body, and optional comments."""
        markdown_parts = []
        if self.extract_headers_footers:
            for section in doc.sections:
                if section.header.paragraphs:
                    markdown_parts.extend(self._process_paragraphs(section.header.paragraphs))
        markdown_parts.extend(self._process_paragraphs(doc.paragraphs))
        # Fix: older python-docx releases expose no Document.comments attribute,
        # so a plain doc.comments would raise AttributeError when the option is on.
        comments = getattr(doc, 'comments', None)
        if self.extract_comments and comments:
            markdown_parts.append("\n## Comments\n")
            for comment in comments:
                markdown_parts.append(f"> {comment.text}\n")
        return "\n".join(markdown_parts)

    def _process_paragraphs(self, paragraphs):
        """Translate paragraphs into Markdown fragments (headings, list items, body text)."""
        markdown_parts = []
        in_list = False
        for para in paragraphs:
            if not para.text.strip():
                continue  # skip visually empty paragraphs
            style = para.style.name.lower()
            if 'heading' in style:
                # Fix: a heading also terminates an open list; without the blank
                # line some Markdown renderers fuse the list and the heading.
                if in_list:
                    markdown_parts.append("\n")
                    in_list = False
                # Fix: parse the full trailing number ('heading 10' -> 10)
                # instead of only the last character.
                digits = re.search(r'(\d+)\s*$', style)
                level = int(digits.group(1)) if digits else 1
                markdown_parts.append(f"{'#' * level} {para.text}\n")
            # Fix: explicit `is not None` -- boolean evaluation of lxml elements
            # is deprecated and is False for childless elements.
            elif para._p.pPr is not None and para._p.pPr.numPr is not None:  # list item
                marker = self._get_list_marker()
                markdown_parts.append(f"{marker} {para.text}\n")
                in_list = True
            else:
                if in_list:
                    markdown_parts.append("\n")  # blank line terminates the list
                    in_list = False
                text = para.text
                if self.preserve_formatting:
                    text = self._apply_formatting(para)
                markdown_parts.append(f"{text}\n")
        return markdown_parts

    def _get_list_marker(self):
        """Return the bullet character for the configured list_style ('-' fallback)."""
        return {
            'dash': '-',
            'asterisk': '*',
            'plus': '+',
        }.get(self.list_style, '-')

    def _apply_formatting(self, paragraph):
        """Rebuild the paragraph text run by run, wrapping bold/italic/underline markers."""
        if not paragraph.text:
            return ""
        formatted_parts = []
        for run in paragraph.runs:
            part = run.text
            if run.bold:
                part = f"**{part}**"
            if run.italic:
                part = f"*{part}*"
            if run.underline:
                part = f"__{part}__"
            formatted_parts.append(part)
        return "".join(formatted_parts)

    def _extract_title(self, doc):
        """Title = first paragraph if it is a heading, else the first 'Heading 1', else a default."""
        if doc.paragraphs:
            first_para = doc.paragraphs[0]
            if 'heading' in first_para.style.name.lower():
                return first_para.text.strip()
        # Fall back to the first explicit Heading 1 anywhere in the document.
        for para in doc.paragraphs:
            if para.style.name.lower() == 'heading 1':
                return para.text.strip()
        return "Untitled Document"


ProcessorRegistry.register("DOCX_PROCESSOR", DocxProcessor)

View File

@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
self.log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')

View File

@@ -0,0 +1,48 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .base_processor import BaseProcessor
from common.utils.business_event_context import current_event
from .processor_registry import ProcessorRegistry
def _find_first_h1(markdown: str) -> str:
# Look for # Header (allowing spaces after #)
match = re.search(r'^#\s+(.+)$', markdown, re.MULTILINE)
return match.group(1).strip() if match else ""
class MarkdownProcessor(BaseProcessor):
    """Passes a Markdown file through unchanged: downloads it from MinIO,
    stores it as the processed markdown, and derives the title from the first H1.
    """

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        # NOTE(review): chunk_size/chunk_overlap are never used in this class --
        # presumably consumed by a downstream chunking stage; confirm.
        self.chunk_size = catalog.max_chunk_size
        self.chunk_overlap = 0
        self.tuning = self.processor.tuning

    def process(self):
        """Return (markdown, title); re-raises download/decode errors after logging."""
        self._log("Starting Markdown processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.bucket_name,
                self.document_version.object_name,
            )
            # Fix: 'utf-8-sig' transparently strips a UTF-8 BOM; with plain
            # 'utf-8' the BOM survives into the stored markdown and hides a
            # leading '# Title' from _find_first_h1. Identical for BOM-less input.
            markdown = file_data.decode('utf-8-sig')
            title = _find_first_h1(markdown)
            self._save_markdown(markdown)
            self._log("Finished processing Markdown")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing Markdown: {str(e)}", level='error')
            raise


ProcessorRegistry.register("MARKDOWN_PROCESSOR", MarkdownProcessor)

View File

@@ -57,7 +57,7 @@ class PDFProcessor(BaseProcessor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
@@ -119,7 +119,7 @@ class PDFProcessor(BaseProcessor):
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables

View File

@@ -45,7 +45,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
self.log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
@@ -64,7 +64,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
self.log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,