- Introduction of dynamic Retrievers & Specialists
- Introduction of dynamic Processors
- Introduction of caching system
- Introduction of a better template manager
- Adaptation of ModelVariables to support dynamic Processors / Retrievers / Specialists
- Start adaptation of chat client
This commit is contained in:
5
eveai_workers/processors/__init__.py
Normal file
5
eveai_workers/processors/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# Import all processor implementations to ensure registration
|
||||
from . import audio_processor, html_processor, pdf_processor
|
||||
|
||||
# List of all available processor implementations
|
||||
__all__ = ['audio_processor', 'html_processor', 'pdf_processor']
|
||||
211
eveai_workers/processors/audio_processor.py
Normal file
211
eveai_workers/processors/audio_processor.py
Normal file
@@ -0,0 +1,211 @@
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
|
||||
import psutil
|
||||
from pydub import AudioSegment
|
||||
import tempfile
|
||||
from common.extensions import minio_client
|
||||
import subprocess
|
||||
|
||||
from .processor_registry import ProcessorRegistry
|
||||
from .transcription_processor import TranscriptionBaseProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
|
||||
|
||||
class AudioProcessor(TranscriptionBaseProcessor):
|
||||
def __init__(self, tenant, model_variables, document_version, catalog, processor):
    """Set up the audio processor from the model configuration.

    On top of what the transcription base processor stores, this copies the
    transcription model and the compression / transcription limits from
    ``model_variables`` and remembers the document's file type.
    """
    super().__init__(tenant, model_variables, document_version, catalog, processor)

    self.transcription_model = model_variables.transcription_model
    self.ffmpeg_path = 'ffmpeg'

    # Maximum segment lengths; both are multiplied by 1000 (-> ms) where used.
    self.max_compression_duration = model_variables.max_compression_duration
    self.max_transcription_duration = model_variables.max_transcription_duration

    # Throttling settings for the compression step.
    self.compression_cpu_limit = model_variables.compression_cpu_limit  # CPU usage limit in percentage
    self.compression_process_delay = model_variables.compression_process_delay  # Delay between processing chunks in seconds

    self.file_type = document_version.file_type
|
||||
|
||||
def _get_transcription(self):
    """Download the source audio, compress it and return its transcription.

    Compression and transcription each run inside their own span of the
    current business event.
    """
    raw_audio = minio_client.download_document_file(
        self.tenant.id,
        self.document_version.bucket_name,
        self.document_version.object_name,
    )

    with current_event.create_span("Audio Compression"):
        compressed = self._compress_audio(raw_audio)

    with current_event.create_span("Audio Transcription"):
        result = self._transcribe_audio(compressed)

    return result
|
||||
|
||||
def _compress_audio(self, audio_data):
    """Compress raw audio segment by segment and store the result in MinIO.

    The bytes are written to a temporary file, re-read in segments of at
    most ``max_compression_duration`` seconds, each segment is compressed
    with ``_compress_segment`` and the results are concatenated. The
    concatenation is uploaded as ``<document_version.id>_compressed.mp3``.

    Args:
        audio_data: Raw bytes of the source audio file.

    Returns:
        The concatenated compressed audio (a pydub ``AudioSegment``).

    Raises:
        Exception: Any failure is logged and re-raised; the temporary file
            is removed either way.
    """
    source_format = self.document_version.file_type

    with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{source_format}') as temp_file:
        temp_file.write(audio_data)
        temp_file_path = temp_file.name

    try:
        # The whole file is loaded once, only to learn its length.
        total_duration = len(AudioSegment.from_file(temp_file_path, format=source_format))
        self._log_tuning("_compress_audio", {"Audio Duration (ms)": total_duration})

        segment_length = self.max_compression_duration * 1000  # Convert to milliseconds
        # Ceiling division: a trailing partial segment still counts.
        total_chunks = (total_duration + segment_length - 1) // segment_length

        compressed_segments = AudioSegment.empty()

        for index in range(total_chunks):
            self._log_tuning("_compress_audio", {"Segment Nr": f"{index + 1} of {total_chunks}"})

            start_ms = index * segment_length
            end_ms = min((index + 1) * segment_length, total_duration)

            # Only this window is decoded from the file.
            chunk = AudioSegment.from_file(
                temp_file_path,
                format=source_format,
                start_second=start_ms / 1000,
                duration=(end_ms - start_ms) / 1000,
            )
            compressed_segments += self._compress_segment(chunk)

            # Pause between segments to throttle the work.
            time.sleep(self.compression_process_delay)

        # Save compressed audio to MinIO
        compressed_filename = f"{self.document_version.id}_compressed.mp3"
        with io.BytesIO() as compressed_buffer:
            compressed_segments.export(compressed_buffer, format="mp3")
            minio_client.upload_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                compressed_filename,
                compressed_buffer.getvalue(),
            )
        self._log_tuning("_compress_audio", {"Compressed audio to MinIO": compressed_filename})

        return compressed_segments

    except Exception as e:
        self._log(f"Error during audio processing: {str(e)}", level='error')
        raise
    finally:
        os.unlink(temp_file_path)  # Ensure the temporary file is deleted
|
||||
|
||||
def _compress_segment(self, audio_segment):
    """Compress one audio segment to mono 16 kHz / 32 kbit/s MP3 via ffmpeg.

    The segment is exported as WAV, piped through ffmpeg (run under
    ``nice -n 19`` with the ``loudnorm`` filter) and the MP3 output is
    loaded back as an ``AudioSegment``.

    Args:
        audio_segment: The pydub ``AudioSegment`` to compress.

    Returns:
        The compressed audio as an ``AudioSegment``.

    Raises:
        Exception: If ffmpeg exits with a non-zero return code.
    """
    with io.BytesIO() as segment_buffer:
        audio_segment.export(segment_buffer, format="wav")
        wav_bytes = segment_buffer.getvalue()

    command = [
        'nice', '-n', '19',
        # Use the configured executable instead of a second hard-coded name.
        self.ffmpeg_path,
        '-i', 'pipe:0',
        '-ar', '16000',
        '-ac', '1',
        '-b:a', '32k',
        '-filter:a', 'loudnorm',
        '-f', 'mp3',
        'pipe:1'
    ]

    process = psutil.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate(input=wav_bytes)

    if process.returncode != 0:
        # errors='replace' keeps undecodable ffmpeg output from masking the
        # real failure with a UnicodeDecodeError.
        self._log(f"FFmpeg error: {stderr.decode(errors='replace')}", level='error')
        raise Exception("FFmpeg compression failed")

    return AudioSegment.from_mp3(io.BytesIO(stdout))
|
||||
|
||||
def _transcribe_audio(self, audio_data):
    """Transcribe the compressed audio chunk by chunk and store the text.

    The audio is cut into chunks of at most ``max_transcription_duration``
    seconds. Each chunk is exported to a temporary MP3 file and sent to the
    transcription model. A chunk whose transcription fails is logged and
    skipped (best effort). The joined text is uploaded to MinIO as
    ``<document_version.id>_transcription.txt``.

    Args:
        audio_data: The compressed audio (sliced and exported like a pydub
            ``AudioSegment``).

    Returns:
        The chunk transcriptions joined by single spaces, or
        ``"No transcription available."`` when nothing was transcribed.
    """
    audio = audio_data

    segment_length = self.max_transcription_duration * 1000  # calculate milliseconds
    total_length = len(audio)
    # Ceiling division: an exact multiple must not report a phantom extra chunk.
    total_chunks = -(-total_length // segment_length)
    remainder_ms = total_length % segment_length
    transcriptions = []

    for i, chunk in enumerate(audio[::segment_length]):
        # Only a trailing partial chunk is shorter than the configured maximum.
        if i == total_chunks - 1 and remainder_ms:
            segment_duration = remainder_ms // 1000
        else:
            segment_duration = self.max_transcription_duration

        # Reserve a path only; the file is written by export() below.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_path = temp_audio.name

        try:
            # An export failure propagates, but the temp file is still removed.
            chunk.export(temp_path, format="mp3")

            try:
                with open(temp_path, 'rb') as audio_file:
                    transcription = self.model_variables.transcription_model.transcribe(
                        file=audio_file,
                        language=self.document_version.language,
                        response_format='verbose_json',
                        duration=segment_duration
                    )
            except Exception as e:
                # Best effort: a failed chunk is logged and skipped.
                self._log(f"Error during transcription: {str(e)}", level='error')
                continue

            if not transcription:
                self._log("Warning: Received empty transcription", level='warning')
                self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
                continue

            # Handle the transcription result based on its type
            if isinstance(transcription, str):
                trans = transcription
            elif hasattr(transcription, 'text'):
                trans = transcription.text
            else:
                trans = str(transcription)

            transcriptions.append(trans)
            self._log_tuning("_transcribe_audio", {
                "Chunk Nr": f"{i + 1} of {total_chunks}",
                "Segment Duration": segment_duration,
                "Transcription": trans,
            })
        finally:
            os.unlink(temp_path)

    full_transcription = " ".join(filter(None, transcriptions))

    if not full_transcription:
        self._log("Warning: No transcription was generated", level='warning')
        full_transcription = "No transcription available."

    # Save transcription to MinIO
    transcription_filename = f"{self.document_version.id}_transcription.txt"
    minio_client.upload_document_file(
        self.tenant.id,
        self.document_version.doc_id,
        self.document_version.language,
        self.document_version.id,
        transcription_filename,
        full_transcription.encode('utf-8')
    )
    self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")

    return full_transcription
|
||||
|
||||
|
||||
# Register the processor
|
||||
ProcessorRegistry.register("AUDIO_PROCESSOR", AudioProcessor)
|
||||
88
eveai_workers/processors/base_processor.py
Normal file
88
eveai_workers/processors/base_processor.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, Any
|
||||
|
||||
from flask import current_app
|
||||
from common.extensions import minio_client
|
||||
from config.logging_config import TuningLogger
|
||||
|
||||
|
||||
class BaseProcessor(ABC):
|
||||
def __init__(self, tenant, model_variables, document_version, catalog, processor):
    """Store the processing context and initialise tuning logging.

    Tuning is taken from ``processor.tuning`` and is off when no processor
    is given.
    """
    self.tenant = tenant
    self.model_variables = model_variables
    self.document_version = document_version
    self.catalog = catalog
    self.processor = processor

    self.tuning = processor.tuning if processor else False
    self.tuning_logger = None
    self._setup_tuning_logger()

    init_details = {
        "processor_type": processor.type if processor else None,
        "document_version": document_version.id if document_version else None,
        "catalog": catalog.id if catalog else None
    }
    self._log_tuning("Processor initialized", init_details)
|
||||
|
||||
def _setup_tuning_logger(self):
    """Create the tuning logger for this tenant / catalog / processor.

    When tuning is enabled a first message is written straight away to
    verify the logger works.

    Raises:
        Exception: Any setup failure is reported to the app logger and
            re-raised.
    """
    def _id_or_none(obj):
        # Missing context objects are logged with a None id.
        return obj.id if obj else None

    try:
        self.tuning_logger = TuningLogger(
            'tuning',
            tenant_id=_id_or_none(self.tenant),
            catalog_id=_id_or_none(self.catalog),
            processor_id=_id_or_none(self.processor),
        )
        # Verify logger is working with a test message
        if self.tuning:
            self.tuning_logger.log_tuning('processor', "Tuning logger initialized")
    except Exception as e:
        current_app.logger.error(f"Failed to setup tuning logger: {str(e)}")
        raise
|
||||
|
||||
@abstractmethod
def process(self):
    """Process the document version; every concrete processor implements this.

    The HTML and PDF processors in this package return a
    ``(markdown, title)`` tuple.
    """
|
||||
|
||||
def _save_markdown(self, markdown):
    """Upload ``markdown`` (UTF-8 encoded) to MinIO as ``<document_version.id>.md``."""
    target_name = f"{self.document_version.id}.md"
    payload_args = (
        self.tenant.id,
        self.document_version.doc_id,
        self.document_version.language,
        self.document_version.id,
        target_name,
        markdown.encode('utf-8'),
    )
    minio_client.upload_document_file(*payload_args)
|
||||
|
||||
def _log(self, message, level='debug'):
    """Log ``message`` on the Flask app logger with a context prefix.

    Args:
        message: Text to log.
        level: Name of the logger method to call (e.g. ``'debug'``,
            ``'warning'``, ``'error'``).
    """
    emit = getattr(current_app.logger, level)
    emit(f"{self.__class__.__name__} - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
|
||||
|
||||
def _save_intermediate(self, content, filename):
    """Upload intermediate text ``content`` (UTF-8 encoded) to MinIO under ``filename``."""
    upload_args = (
        self.tenant.id,
        self.document_version.doc_id,
        self.document_version.language,
        self.document_version.id,
        filename,
        content.encode('utf-8'),
    )
    minio_client.upload_document_file(*upload_args)
|
||||
|
||||
def _clean_markdown(self, markdown):
|
||||
markdown = markdown.strip()
|
||||
if markdown.startswith("```markdown"):
|
||||
markdown = markdown[len("```markdown"):].strip()
|
||||
if markdown.endswith("```"):
|
||||
markdown = markdown[:-3].strip()
|
||||
|
||||
return markdown
|
||||
|
||||
def _log_tuning(self, message: str, data: Dict[str, Any] = None) -> None:
|
||||
if self.tuning and self.tuning_logger:
|
||||
try:
|
||||
self.tuning_logger.log_tuning('processor', message, data)
|
||||
except Exception as e:
|
||||
current_app.logger.error(f"Processor: Error in tuning logging: {e}")
|
||||
|
||||
|
||||
163
eveai_workers/processors/html_processor.py
Normal file
163
eveai_workers/processors/html_processor.py
Normal file
@@ -0,0 +1,163 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from common.extensions import db, minio_client
|
||||
from common.utils.model_utils import create_language_template
|
||||
from .base_processor import BaseProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
from .processor_registry import ProcessorRegistry
|
||||
from common.utils.string_list_converter import StringListConverter as SLC
|
||||
|
||||
|
||||
class HTMLProcessor(BaseProcessor):
|
||||
def __init__(self, tenant, model_variables, document_version, catalog, processor):
    """Configure the HTML processor from the processor's configuration.

    The tag / element / class filter lists are read from
    ``processor.configuration`` and converted with
    ``StringListConverter.string_to_list``. The chunk size comes from
    ``catalog.max_chunk_size``.
    """
    super().__init__(tenant, model_variables, document_version, catalog, processor)

    proc_conf = processor.configuration
    self.html_tags = SLC.string_to_list(proc_conf['html_tags'])
    self.html_end_tags = SLC.string_to_list(proc_conf['html_end_tags'])
    self.html_included_elements = SLC.string_to_list(proc_conf['html_included_elements'])
    self.html_excluded_elements = SLC.string_to_list(proc_conf['html_excluded_elements'])
    self.html_excluded_classes = SLC.string_to_list(proc_conf['html_excluded_classes'])
    self.tuning = self.processor.tuning

    # Add verification logging
    self._log(f"HTML Processor initialized with tuning={self.tuning}")
    if self.tuning:
        self._log_tuning("HTML Processor initialized", {
            "html_tags": self.html_tags,
            "html_end_tags": self.html_end_tags,
            "included_elements": self.html_included_elements,
            "excluded_elements": self.html_excluded_elements,
            # Logged as well so the tuning record shows every active filter.
            "excluded_classes": self.html_excluded_classes,
        })

    self.chunk_size = catalog.max_chunk_size
|
||||
|
||||
def process(self):
    """Convert the stored HTML document to Markdown.

    Downloads the file, extracts the configured elements, has the LLM turn
    them into Markdown and saves the result.

    Returns:
        Tuple ``(markdown, title)``.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    self._log("Starting HTML processing")
    try:
        raw_bytes = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        html_text = raw_bytes.decode('utf-8')

        with current_event.create_span("HTML Content Extraction"):
            extracted_html, title = self._parse_html(html_text)

        with current_event.create_span("Markdown Generation"):
            markdown = self._generate_markdown_from_html(extracted_html)

        self._save_markdown(markdown)
        self._log("Finished processing HTML")
        return markdown, title
    except Exception as e:
        self._log(f"Error processing HTML: {str(e)}", level='error')
        raise
|
||||
|
||||
def _parse_html(self, html_content):
    """Extract the configured tags from ``html_content`` as simplified HTML.

    Only tags listed in ``html_tags`` are kept. They are searched inside the
    ``html_included_elements`` when that list is non-empty, otherwise in the
    whole document. Elements hit by an exclusion rule are skipped.

    Returns:
        Tuple ``(extracted_html, title)``; ``title`` is the text of the
        ``<title>`` tag or ``''`` when there is none.
    """
    self._log(f'Parsing HTML for tenant {self.tenant.id}')
    soup = BeautifulSoup(html_content, 'html.parser')
    excluded_classes = self._parse_excluded_classes(self.html_excluded_classes)

    search_roots = soup.find_all(self.html_included_elements) if self.html_included_elements else [soup]

    fragments = []
    for root in search_roots:
        for tag in root.find_all(self.html_tags):
            if not self._should_exclude_element(tag, excluded_classes):
                fragments.append(self._extract_element_content(tag))
    extracted_html = ''.join(fragments)

    title_tag = soup.find('title')
    title = title_tag.get_text(strip=True) if title_tag else ''

    self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
    self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
    return extracted_html, title
|
||||
|
||||
def _generate_markdown_from_html(self, html_content):
    """Turn extracted HTML into Markdown with the ``html_parse`` LLM template.

    The HTML is split into chunks of roughly ``self.chunk_size`` characters;
    each chunk is sent through the prompt chain and the answers are joined
    with blank lines.
    """
    self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')

    llm = self.model_variables.get_llm()
    template = self.model_variables.get_template("html_parse")
    chain = RunnablePassthrough() | ChatPromptTemplate.from_template(template) | llm | StrOutputParser()

    # NOTE(review): parsed with 'lxml' here while _parse_html uses
    # 'html.parser' - confirm this difference is intentional.
    soup = BeautifulSoup(html_content, 'lxml')

    markdown_chunks = []
    for chunk in self._split_content(soup, self.chunk_size):
        markdown_chunk = chain.invoke({"html": chunk})
        markdown_chunks.append(markdown_chunk)
        self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})

    markdown = "\n\n".join(markdown_chunks)
    self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
    return markdown
|
||||
|
||||
def _split_content(self, soup, max_size=20000):
|
||||
chunks = []
|
||||
current_chunk = []
|
||||
current_size = 0
|
||||
|
||||
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
|
||||
element_html = str(element)
|
||||
element_size = len(element_html)
|
||||
|
||||
if current_size + element_size > max_size and current_chunk:
|
||||
chunks.append(''.join(map(str, current_chunk)))
|
||||
current_chunk = []
|
||||
current_size = 0
|
||||
|
||||
current_chunk.append(element)
|
||||
current_size += element_size
|
||||
|
||||
if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
|
||||
chunks.append(''.join(map(str, current_chunk)))
|
||||
current_chunk = []
|
||||
current_size = 0
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(''.join(map(str, current_chunk)))
|
||||
|
||||
return chunks
|
||||
|
||||
def _parse_excluded_classes(self, excluded_classes):
|
||||
parsed = {}
|
||||
if excluded_classes:
|
||||
for rule in excluded_classes:
|
||||
element, cls = rule.split('.', 1)
|
||||
parsed.setdefault(element, set()).add(cls)
|
||||
return parsed
|
||||
|
||||
def _should_exclude_element(self, element, excluded_classes):
    """Tell whether ``element`` must be left out of the extraction.

    An element is excluded when it sits inside one of the
    ``html_excluded_elements`` or when a class exclusion rule hits it or one
    of its ancestors.
    """
    excluded_tags = self.html_excluded_elements
    if excluded_tags and element.find_parent(excluded_tags):
        return True

    return self._is_element_excluded_by_class(element, excluded_classes)
|
||||
|
||||
def _is_element_excluded_by_class(self, element, excluded_classes):
    """Check the element's ancestors first, then the element itself, against the class rules."""
    ancestor_hit = any(
        self._element_matches_exclusion(ancestor, excluded_classes)
        for ancestor in element.parents
    )
    if ancestor_hit:
        return True
    return self._element_matches_exclusion(element, excluded_classes)
|
||||
|
||||
def _element_matches_exclusion(self, element, excluded_classes):
|
||||
if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
|
||||
return True
|
||||
return element.name in excluded_classes and \
|
||||
any(cls in excluded_classes[element.name] for cls in element.get('class', []))
|
||||
|
||||
def _extract_element_content(self, element):
|
||||
content = ' '.join(child.strip() for child in element.stripped_strings)
|
||||
return f'<{element.name}>{content}</{element.name}>\n'
|
||||
|
||||
|
||||
# Register the processor
|
||||
ProcessorRegistry.register("HTML_PROCESSOR", HTMLProcessor)
|
||||
231
eveai_workers/processors/pdf_processor.py
Normal file
231
eveai_workers/processors/pdf_processor.py
Normal file
@@ -0,0 +1,231 @@
|
||||
import io
|
||||
import pdfplumber
|
||||
from flask import current_app
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
import re
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
|
||||
from common.extensions import minio_client
|
||||
from common.utils.model_utils import create_language_template
|
||||
from .base_processor import BaseProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
from .processor_registry import ProcessorRegistry
|
||||
|
||||
|
||||
class PDFProcessor(BaseProcessor):
|
||||
def __init__(self, tenant, model_variables, document_version, catalog, processor):
    """Configure the PDF processor.

    The chunk size for LLM calls comes from ``catalog.max_chunk_size``;
    chunks do not overlap.
    """
    super().__init__(tenant, model_variables, document_version, catalog, processor)

    self.chunk_size = catalog.max_chunk_size
    self.chunk_overlap = 0  # no overlap between LLM chunks
    self.tuning = self.processor.tuning
|
||||
|
||||
def process(self):
    """Convert the stored PDF document to Markdown.

    Downloads the file, extracts and structures its content, has the LLM
    rewrite it chunk by chunk as Markdown and saves the result.

    Returns:
        Tuple ``(markdown, title)``.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    self._log("Starting PDF processing")
    try:
        pdf_bytes = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )

        with current_event.create_span("PDF Extraction"):
            pages = self._extract_content(pdf_bytes)
            structured_content, title = self._structure_content(pages)

        with current_event.create_span("Markdown Generation"):
            markdown = self._process_chunks_with_llm(
                self._split_content_for_llm(structured_content)
            )

        self._save_markdown(markdown)
        self._log("Finished processing PDF")
        return markdown, title
    except Exception as e:
        self._log(f"Error processing PDF: {str(e)}", level='error')
        raise
|
||||
|
||||
def _extract_content(self, file_data):
    """Extract text, figures and tables from every page of the PDF.

    Args:
        file_data: Raw bytes of the PDF file.

    Returns:
        One dict per page with the keys ``'text'``, ``'figures'`` and
        ``'tables'``.
    """
    pages = []
    next_figure_number = 1

    with pdfplumber.open(io.BytesIO(file_data)) as pdf:
        for page_num, page in enumerate(pdf.pages):
            self._log(f"Extracting content from page {page_num + 1}")

            text = page.extract_text()
            figures = self._extract_figures(page, page_num, next_figure_number)
            tables = self._extract_tables(page)
            page_content = {'text': text, 'figures': figures, 'tables': tables}

            self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})

            # Figure numbers keep counting across pages.
            next_figure_number += len(figures)
            pages.append(page_content)

    return pages
|
||||
|
||||
def _extract_figures(self, page, page_num, figure_counter):
|
||||
figures = []
|
||||
# Omit figure processing for now!
|
||||
# for img in page.images:
|
||||
# try:
|
||||
# # Try to get the bbox, use full page dimensions if not available
|
||||
# bbox = img.get('bbox', (0, 0, page.width, page.height))
|
||||
#
|
||||
# figure = {
|
||||
# 'figure_number': figure_counter,
|
||||
# 'filename': f"figure_{page_num + 1}_{figure_counter}.png",
|
||||
# 'caption': self._find_figure_caption(page, bbox)
|
||||
# }
|
||||
#
|
||||
# # Extract the figure as an image
|
||||
# figure_image = page.within_bbox(bbox).to_image()
|
||||
#
|
||||
# # Save the figure using MinIO
|
||||
# with io.BytesIO() as output:
|
||||
# figure_image.save(output, format='PNG')
|
||||
# output.seek(0)
|
||||
# minio_client.upload_document_file(
|
||||
# self.tenant.id,
|
||||
# self.document_version.doc_id,
|
||||
# self.document_version.language,
|
||||
# self.document_version.id,
|
||||
# figure['filename'],
|
||||
# output.getvalue()
|
||||
# )
|
||||
#
|
||||
# figures.append(figure)
|
||||
# figure_counter += 1
|
||||
# except Exception as e:
|
||||
# self._log(f"Error processing figure on page {page_num + 1}: {str(e)}", level='error')
|
||||
|
||||
return figures
|
||||
|
||||
def _find_figure_caption(self, page, bbox):
|
||||
try:
|
||||
# Look for text below the figure
|
||||
caption_bbox = (bbox[0], bbox[3], bbox[2], min(bbox[3] + 50, page.height))
|
||||
caption_text = page.crop(caption_bbox).extract_text()
|
||||
if caption_text and caption_text.lower().startswith('figure'):
|
||||
return caption_text
|
||||
except Exception as e:
|
||||
self._log(f"Error finding figure caption: {str(e)}", level='error')
|
||||
return None
|
||||
|
||||
def _extract_tables(self, page):
    """Extract the page's tables as Markdown strings.

    Empty tables are skipped. An extraction error is logged and the tables
    collected so far are returned.
    """
    tables = []
    try:
        for raw_table in page.extract_tables():
            if not raw_table:
                continue
            markdown_table = self._table_to_markdown(raw_table)
            if not markdown_table:  # Only add non-empty tables
                continue
            tables.append(markdown_table)
            self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
    except Exception as e:
        self._log(f"Error extracting tables from page: {str(e)}", level='error')
    return tables
|
||||
|
||||
def _table_to_markdown(self, table):
|
||||
if not table or not table[0]: # Check if table is empty or first row is empty
|
||||
return "" # Return empty string for empty tables
|
||||
|
||||
def clean_cell(cell):
|
||||
if cell is None:
|
||||
return "" # Convert None to empty string
|
||||
return str(cell).replace("|", "\\|") # Escape pipe characters and convert to string
|
||||
|
||||
header = [clean_cell(cell) for cell in table[0]]
|
||||
markdown = "| " + " | ".join(header) + " |\n"
|
||||
markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
|
||||
|
||||
for row in table[1:]:
|
||||
cleaned_row = [clean_cell(cell) for cell in row]
|
||||
markdown += "| " + " | ".join(cleaned_row) + " |\n"
|
||||
|
||||
return markdown
|
||||
|
||||
def _structure_content(self, extracted_content):
|
||||
structured_content = ""
|
||||
title = "Untitled Document"
|
||||
current_heading_level = 0
|
||||
heading_pattern = re.compile(r'^(\d+(\.\d+)*\.?\s*)?(.+)$')
|
||||
|
||||
def identify_heading(text):
|
||||
match = heading_pattern.match(text.strip())
|
||||
if match:
|
||||
numbering, _, content = match.groups()
|
||||
if numbering:
|
||||
level = numbering.count('.') + 1
|
||||
return level, f"{numbering}{content}"
|
||||
else:
|
||||
return 1, content # Assume it's a top-level heading if no numbering
|
||||
return 0, text # Not a heading
|
||||
|
||||
for page in extracted_content:
|
||||
# Assume the title is on the first page
|
||||
if page == extracted_content[0]:
|
||||
lines = page.get('text', '').split('\n')
|
||||
if lines:
|
||||
title = lines[0].strip() # Use the first non-empty line as the title
|
||||
|
||||
# Process text
|
||||
paragraphs = page['text'].split('\n\n')
|
||||
|
||||
for para in paragraphs:
|
||||
lines = para.strip().split('\n')
|
||||
if len(lines) == 1: # Potential heading
|
||||
level, text = identify_heading(lines[0])
|
||||
if level > 0:
|
||||
heading_marks = '#' * level
|
||||
structured_content += f"\n\n{heading_marks} {text}\n\n"
|
||||
if level == 1 and not title:
|
||||
title = text # Use the first top-level heading as the title if not set
|
||||
else:
|
||||
structured_content += f"{para}\n\n" # Treat as normal paragraph
|
||||
else:
|
||||
structured_content += f"{para}\n\n" # Multi-line paragraph
|
||||
|
||||
# Process figures
|
||||
for figure in page.get('figures', []):
|
||||
structured_content += f"\n\n![Figure {figure['figure_number']}]({figure['filename']})\n\n"
|
||||
if figure['caption']:
|
||||
structured_content += f"*Figure {figure['figure_number']}: {figure['caption']}*\n\n"
|
||||
|
||||
# Add tables
|
||||
if 'tables' in page:
|
||||
for table in page['tables']:
|
||||
structured_content += f"\n{table}\n"
|
||||
|
||||
if self.tuning:
|
||||
self._save_intermediate(structured_content, "structured_content.md")
|
||||
|
||||
return structured_content, title
|
||||
|
||||
def _split_content_for_llm(self, content):
    """Split ``content`` into chunks of at most ``self.chunk_size`` characters.

    Splitting prefers paragraph breaks, then line breaks, then spaces, and
    falls back to splitting anywhere.
    """
    splitter_options = {
        "chunk_size": self.chunk_size,
        "chunk_overlap": self.chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(content)
|
||||
|
||||
def _process_chunks_with_llm(self, chunks):
    """Rewrite each chunk as Markdown with the ``pdf_parse`` LLM template.

    Each answer is passed through ``_clean_markdown``; the cleaned answers
    are joined with blank lines.
    """
    llm = self.model_variables.get_llm()
    template = self.model_variables.get_template('pdf_parse')
    chain = RunnablePassthrough() | ChatPromptTemplate.from_template(template) | llm | StrOutputParser()

    cleaned_chunks = [
        self._clean_markdown(chain.invoke({"pdf_content": chunk}))
        for chunk in chunks
    ]
    return "\n\n".join(cleaned_chunks)
|
||||
|
||||
|
||||
# Register the processor
|
||||
ProcessorRegistry.register("PDF_PROCESSOR", PDFProcessor)
|
||||
92
eveai_workers/processors/processor_registry.py
Normal file
92
eveai_workers/processors/processor_registry.py
Normal file
@@ -0,0 +1,92 @@
|
||||
from typing import Dict, Type, Optional
|
||||
from flask import current_app
|
||||
from config.processor_types import PROCESSOR_TYPES
|
||||
from .base_processor import BaseProcessor
|
||||
|
||||
|
||||
class ProcessorRegistry:
|
||||
"""Registry for processor types that aligns with PROCESSOR_TYPES configuration"""
|
||||
|
||||
_registry: Dict[str, Type[BaseProcessor]] = {}
|
||||
|
||||
@classmethod
def register(cls, processor_type: str, processor_class: Type[BaseProcessor]):
    """
    Register a processor class under a type defined in PROCESSOR_TYPES

    An existing registration for the same type is overwritten.

    Args:
        processor_type: Type identifier from PROCESSOR_TYPES
        processor_class: Processor implementation class

    Raises:
        ValueError: If processor_type isn't defined in PROCESSOR_TYPES
    """
    if processor_type in PROCESSOR_TYPES:
        cls._registry[processor_type] = processor_class
        return

    raise ValueError(f"Processor type {processor_type} not found in PROCESSOR_TYPES configuration")
|
||||
|
||||
@classmethod
def get_processor_class(cls, processor_type: str) -> Type[BaseProcessor]:
    """
    Look up the processor class registered for a processor type

    Args:
        processor_type: Type identifier from PROCESSOR_TYPES

    Returns:
        The registered processor class

    Raises:
        ValueError: If no processor is registered for the given type
    """
    try:
        return cls._registry[processor_type]
    except KeyError:
        # Callers expect ValueError; the KeyError adds nothing to the traceback.
        raise ValueError(f"No processor registered for type: {processor_type}") from None
|
||||
|
||||
@classmethod
def get_processor_for_file_type(cls, file_type: str) -> tuple[str, Type[BaseProcessor]]:
    """
    Find the processor for a file type via the PROCESSOR_TYPES definitions

    The first processor type whose 'file_types' contains the file type wins.

    Args:
        file_type: File extension (e.g., 'html', 'pdf')

    Returns:
        Tuple of (processor_type, processor_class)

    Raises:
        ValueError: If no processor type handles the file type, or the
            matching type has no registered processor
    """
    for proc_type, config in PROCESSOR_TYPES.items():
        supported_types = config['file_types']
        # 'file_types' may be a comma-separated string or a list.
        if isinstance(supported_types, str):
            supported_types = [entry.strip() for entry in supported_types.split(',')]

        if file_type not in supported_types:
            continue

        if proc_type not in cls._registry:
            raise ValueError(
                f"Found processor type {proc_type} for file type {file_type} but no processor is registered")
        return proc_type, cls._registry[proc_type]

    raise ValueError(f"No processor type found for file type: {file_type}")
|
||||
|
||||
@classmethod
def validate_processor_registration(cls):
    """
    Check that every type in PROCESSOR_TYPES has a registered processor

    Raises:
        ValueError: If any processor type lacks a registered processor
    """
    missing_processors = [
        proc_type for proc_type in PROCESSOR_TYPES.keys()
        if proc_type not in cls._registry
    ]

    if missing_processors:
        raise ValueError(f"Missing processor registrations for: {', '.join(missing_processors)}")
|
||||
32
eveai_workers/processors/srt_processor.py
Normal file
32
eveai_workers/processors/srt_processor.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from common.extensions import minio_client
|
||||
from .transcription_processor import TranscriptionBaseProcessor
|
||||
import re
|
||||
|
||||
|
||||
class SRTProcessor(TranscriptionBaseProcessor):
    """Transcription processor for SubRip (.srt) subtitle files.

    Only the retrieval of the raw transcription is implemented here: the
    subtitle file is downloaded and reduced to its spoken text. The rest of the
    processing flow is inherited from TranscriptionBaseProcessor.
    """

    # Cue timing line, e.g. "00:00:01,000 --> 00:00:04,000"
    _TIMECODE_PATTERN = re.compile(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')

    def _get_transcription(self):
        """Download the SRT file of this document version and return its text.

        Returns:
            The subtitle text as a single string, without cue numbers and timecodes.
        """
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.bucket_name,
            self.document_version.object_name,
        )
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte order
        # mark. With plain 'utf-8' the BOM stays glued to the first cue number,
        # which then fails the isdigit() check and ends up in the transcript.
        srt_content = file_data.decode('utf-8-sig')
        return self._clean_srt(srt_content)

    def _clean_srt(self, srt_content):
        """Strip the SRT structure and return the spoken text on one line.

        Args:
            srt_content: Decoded content of an SRT file.

        Returns:
            All subtitle text lines joined by single spaces. Blank lines, lines
            consisting only of digits (cue numbers) and timecode lines are dropped.
        """
        cleaned_lines = []
        for raw_line in srt_content.split('\n'):
            # Strip before testing so that indentation or a trailing '\r' cannot
            # hide a cue number or a timecode from the checks below.
            line = raw_line.strip()
            if not line or line.isdigit() or self._TIMECODE_PATTERN.match(line):
                continue
            cleaned_lines.append(line)

        # Join the cleaned lines
        cleaned_text = ' '.join(cleaned_lines)

        # Remove any extra spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

        return cleaned_text
|
||||
|
||||
98
eveai_workers/processors/transcription_processor.py
Normal file
98
eveai_workers/processors/transcription_processor.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# transcription_processor.py
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
|
||||
from common.utils.model_utils import create_language_template
|
||||
from .base_processor import BaseProcessor
|
||||
from common.utils.business_event_context import current_event
|
||||
|
||||
|
||||
class TranscriptionBaseProcessor(BaseProcessor):
    """Base class for processors that convert a raw transcription into markdown.

    Subclasses provide the raw text through `_get_transcription`. This class
    splits that text into chunks, sends each chunk through the 'transcript'
    prompt template and the configured LLM, and joins the markdown results.
    """

    def __init__(self, tenant, model_variables, document_version, catalog, processor):
        super().__init__(tenant, model_variables, document_version, catalog, processor)
        self.annotation_chunk_size = model_variables.annotation_chunk_length
        self.annotation_chunk_overlap = 0

    def process(self):
        """Run the transcription-to-markdown flow.

        Returns:
            Tuple of (full_markdown, title), where title is taken from the
            first level-1 header of the markdown.

        Raises:
            Exception: Any error raised while processing is logged and re-raised.
        """
        self._log("Starting Transcription processing")
        try:
            with current_event.create_span("Transcription Generation"):
                transcription = self._get_transcription()
            with current_event.create_span("Markdown Generation"):
                chunks = self._chunk_transcription(transcription)
                markdown_chunks = self._process_chunks(chunks)
                full_markdown = self._combine_markdown_chunks(markdown_chunks)
                self._save_markdown(full_markdown)
            self._log("Finished processing Transcription")
            return full_markdown, self._extract_title_from_markdown(full_markdown)
        except Exception as e:
            self._log(f"Error processing Transcription: {str(e)}", level='error')
            raise

    def _get_transcription(self):
        """Return the raw transcription text. Must be implemented by child classes."""
        raise NotImplementedError

    def _chunk_transcription(self, transcription):
        """Split the transcription into chunks of at most `annotation_chunk_size` characters.

        Args:
            transcription: The raw transcription text.

        Returns:
            List of text chunks, split preferably on paragraph breaks, then on
            line breaks, then on spaces.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.annotation_chunk_size,
            chunk_overlap=self.annotation_chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        return text_splitter.split_text(transcription)

    def _process_chunks(self, chunks):
        """Turn each transcription chunk into markdown using the LLM.

        Every call receives the current chunk together with the tail of the
        markdown generated for the previous chunk (empty for the first chunk).

        Args:
            chunks: List of transcription text chunks.

        Returns:
            List of cleaned markdown strings, one per chunk, in input order.
        """
        self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
        llm = self.model_variables.get_llm()
        template = self.model_variables.get_template('transcript')
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        chain = setup | transcript_prompt | llm | output_parser

        markdown_chunks = []
        previous_part = ""
        for i, chunk in enumerate(chunks):
            input_transcript = {
                'transcript': chunk,
                'previous_part': previous_part
            }
            markdown = chain.invoke(input_transcript)
            markdown = self._clean_markdown(markdown)
            self._log_tuning("_process_chunks", {
                "Chunk Number": f"{i + 1} of {len(chunks)}",
                "Chunk": chunk,
                "Previous Chunk": previous_part,
                "Markdown": markdown,
            })
            markdown_chunks.append(markdown)

            # Extract the last part for the next iteration
            previous_part = self._extract_previous_part(markdown)

        return markdown_chunks

    @staticmethod
    def _extract_previous_part(markdown):
        """Return the tail of `markdown` that is passed along with the next chunk.

        Args:
            markdown: Markdown generated for one chunk.

        Returns:
            Everything from the last line starting with '#' to the end, or the
            last line when the markdown contains no such line.
        """
        lines = markdown.split('\n')
        # Slice at the position found while scanning backwards. Looking the
        # header text up again with lines.index() returns its FIRST occurrence,
        # which is the wrong line when the same header appears more than once.
        for position in range(len(lines) - 1, -1, -1):
            if lines[position].startswith('#'):
                return '\n'.join(lines[position:])
        # str.split always yields at least one element, so lines[-1] is safe
        return lines[-1]

    def _combine_markdown_chunks(self, markdown_chunks):
        """Join the per-chunk markdown strings, separated by a blank line."""
        return "\n\n".join(markdown_chunks)

    def _extract_title_from_markdown(self, markdown):
        """Return the text of the first level-1 header, or a fallback title.

        Args:
            markdown: The complete markdown document.

        Returns:
            The header text without the leading '# ', or "Untitled Transcription"
            when no line starts with '# '.
        """
        lines = markdown.split('\n')
        for line in lines:
            if line.startswith('# '):
                return line[2:].strip()
        return "Untitled Transcription"
|
||||
Reference in New Issue
Block a user