- Improvements to document uploads (accept files other than HTML when entering a URL)
- Introduction of API functionality (to be continued); deduplication of document and URL uploads between the views and the API
- Improvements to document processing: introduction of processor classes to streamline document inputs (see the dispatch sketch below)
- Removed the pure YouTube functionality, as YouTube's document retrieval changes continuously; added uploads of srt, mp3, ogg and mp4 files instead
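
The dispatch wiring itself is not part of this diff; the following is a minimal sketch of how the new processor classes might be selected per file type. The registry and helper names are hypothetical, and the pdf_processor module path is assumed from the naming convention of the other files; the constructor signature and the process() contract come from the code below.

# Hypothetical dispatch sketch; the actual wiring is not shown in this commit.
from eveai_workers.Processors.audio_processor import AudioProcessor
from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor  # module path assumed
from eveai_workers.Processors.srt_processor import SRTProcessor

# Assumed mapping: mp3/ogg/mp4 all go through AudioProcessor, which
# transcodes via ffmpeg before transcribing.
PROCESSOR_REGISTRY = {
    'html': HTMLProcessor,
    'pdf': PDFProcessor,
    'srt': SRTProcessor,
    'mp3': AudioProcessor,
    'ogg': AudioProcessor,
    'mp4': AudioProcessor,
}

def process_document(tenant, model_variables, document_version):
    """Pick a processor by file type and run it (hypothetical helper)."""
    cls = PROCESSOR_REGISTRY.get(document_version.file_type)
    if cls is None:
        raise ValueError(f"Unsupported file type: {document_version.file_type}")
    return cls(tenant, model_variables, document_version).process()
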
eveai_workers/Processors/audio_processor.py (new file, 187 lines)
@@ -0,0 +1,187 @@
import io
import os
from pydub import AudioSegment
import tempfile
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
import subprocess


class AudioProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        self.transcription_client = model_variables['transcription_client']
        self.transcription_model = model_variables['transcription_model']
        self.ffmpeg_path = 'ffmpeg'

    def process(self):
        self._log("Starting Audio processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                self.document_version.file_name
            )

            compressed_audio = self._compress_audio(file_data)
            transcription = self._transcribe_audio(compressed_audio)
            markdown, title = self._generate_markdown_from_transcription(transcription)

            self._save_markdown(markdown)
            self._log("Finished processing Audio")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing Audio: {str(e)}", level='error')
            raise

    def _compress_audio(self, audio_data):
        self._log("Compressing audio")
        with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_input:
            temp_input.write(audio_data)
            temp_input.flush()

        # Use a unique filename for the output to avoid conflicts
        output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        try:
            result = subprocess.run(
                [self.ffmpeg_path, '-y', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', output_path],
                capture_output=True,
                text=True,
                check=True
            )

            with open(output_path, 'rb') as f:
                compressed_data = f.read()

            # Save compressed audio to MinIO
            compressed_filename = f"{self.document_version.id}_compressed.mp3"
            minio_client.upload_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                compressed_filename,
                compressed_data
            )
            self._log(f"Saved compressed audio to MinIO: {compressed_filename}")

            return compressed_data

        except subprocess.CalledProcessError as e:
            error_message = f"Compression failed: {e.stderr}"
            self._log(error_message, level='error')
            raise Exception(error_message)

        finally:
            # Clean up temporary files
            os.unlink(temp_input.name)
            if os.path.exists(output_path):
                os.unlink(output_path)

    def _transcribe_audio(self, audio_data):
        self._log("Starting audio transcription")
        audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")

        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
        transcriptions = []

        for i, chunk in enumerate(audio[::segment_length]):
            self._log(f'Processing chunk {i + 1} of {len(audio) // segment_length + 1}')

            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
                chunk.export(temp_audio.name, format="mp3")
                temp_audio.flush()

            try:
                file_size = os.path.getsize(temp_audio.name)
                self._log(f"Temporary audio file size: {file_size} bytes")

                with open(temp_audio.name, 'rb') as audio_file:
                    file_start = audio_file.read(100)
                    self._log(f"First 100 bytes of audio file: {file_start}")
                    audio_file.seek(0)  # Reset file pointer to the beginning

                    self._log("Calling transcription API")
                    transcription = self.transcription_client.audio.transcriptions.create(
                        file=audio_file,
                        model=self.transcription_model,
                        language=self.document_version.language,
                        response_format='verbose_json',
                    )
                    self._log("Transcription API call completed")

                if transcription:
                    # Handle the transcription result based on its type
                    if isinstance(transcription, str):
                        self._log(f"Transcription result (string): {transcription[:100]}...")
                        transcriptions.append(transcription)
                    elif hasattr(transcription, 'text'):
                        self._log(
                            f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
                        transcriptions.append(transcription.text)
                    else:
                        self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
                        transcriptions.append(str(transcription))
                else:
                    self._log("Warning: Received empty transcription", level='warning')

            except Exception as e:
                self._log(f"Error during transcription: {str(e)}", level='error')
            finally:
                os.unlink(temp_audio.name)

        full_transcription = " ".join(filter(None, transcriptions))

        if not full_transcription:
            self._log("Warning: No transcription was generated", level='warning')
            full_transcription = "No transcription available."

        # Save transcription to MinIO
        transcription_filename = f"{self.document_version.id}_transcription.txt"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            transcription_filename,
            full_transcription.encode('utf-8')
        )
        self._log(f"Saved transcription to MinIO: {transcription_filename}")

        return full_transcription

    def _generate_markdown_from_transcription(self, transcription):
        self._log("Generating markdown from transcription")
        llm = self.model_variables['llm']
        template = self.model_variables['transcript_template']
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        chain = setup | transcript_prompt | llm | output_parser

        input_transcript = {'transcript': transcription}
        markdown = chain.invoke(input_transcript)

        # Extract title from the markdown
        title = self._extract_title_from_markdown(markdown)

        return markdown, title

    def _extract_title_from_markdown(self, markdown):
        # Simple extraction of the first header as the title
        lines = markdown.split('\n')
        for line in lines:
            if line.startswith('# '):
                return line[2:].strip()
        return "Untitled Audio Transcription"
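
A side note on the chunking loop above: audio[::segment_length] relies on pydub's slice-with-step behavior, which yields consecutive fixed-length segments rather than a strided selection. A self-contained illustration (durations arbitrary):

from pydub import AudioSegment

# pydub slices in milliseconds; a step in the slice yields consecutive
# chunks of that length, which is exactly what the processor iterates over.
audio = AudioSegment.silent(duration=25 * 60 * 1000)  # 25 minutes of silence
segment_length = 10 * 60 * 1000                       # 10-minute chunks

chunks = list(audio[::segment_length])
print(len(chunks))       # 3 chunks: 10 + 10 + 5 minutes
print(len(chunks[-1]))   # 300000 ms, i.e. the 5-minute remainder
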
eveai_workers/Processors/html_processor.py (new file, 142 lines)
@@ -0,0 +1,142 @@
from bs4 import BeautifulSoup
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import db, minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor


class HTMLProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)
        self.html_tags = model_variables['html_tags']
        self.html_end_tags = model_variables['html_end_tags']
        self.html_included_elements = model_variables['html_included_elements']
        self.html_excluded_elements = model_variables['html_excluded_elements']

    def process(self):
        self._log("Starting HTML processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                self.document_version.file_name
            )
            html_content = file_data.decode('utf-8')

            extracted_html, title = self._parse_html(html_content)
            markdown = self._generate_markdown_from_html(extracted_html)

            self._save_markdown(markdown)
            self._log("Finished processing HTML")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing HTML: {str(e)}", level='error')
            raise

    def _parse_html(self, html_content):
        self._log(f'Parsing HTML for tenant {self.tenant.id}')
        soup = BeautifulSoup(html_content, 'html.parser')
        extracted_html = ''
        excluded_classes = self._parse_excluded_classes(self.tenant.html_excluded_classes)

        if self.html_included_elements:
            elements_to_parse = soup.find_all(self.html_included_elements)
        else:
            elements_to_parse = [soup]

        for element in elements_to_parse:
            for sub_element in element.find_all(self.html_tags):
                if self._should_exclude_element(sub_element, excluded_classes):
                    continue
                extracted_html += self._extract_element_content(sub_element)

        title = soup.find('title').get_text(strip=True) if soup.find('title') else ''

        self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
        return extracted_html, title

    def _generate_markdown_from_html(self, html_content):
        self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')

        llm = self.model_variables['llm']
        template = self.model_variables['html_parse_template']
        parse_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        chain = setup | parse_prompt | llm | output_parser

        soup = BeautifulSoup(html_content, 'lxml')
        chunks = self._split_content(soup)

        markdown_chunks = []
        for chunk in chunks:
            if self.embed_tuning:
                self._log(f'Processing chunk: \n{chunk}\n')
            input_html = {"html": chunk}
            markdown_chunk = chain.invoke(input_html)
            markdown_chunks.append(markdown_chunk)
            if self.embed_tuning:
                self._log(f'Processed markdown chunk: \n{markdown_chunk}\n')

        markdown = "\n\n".join(markdown_chunks)
        self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
        return markdown

    def _split_content(self, soup, max_size=20000):
        chunks = []
        current_chunk = []
        current_size = 0

        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
            element_html = str(element)
            element_size = len(element_html)

            if current_size + element_size > max_size and current_chunk:
                chunks.append(''.join(map(str, current_chunk)))
                current_chunk = []
                current_size = 0

            current_chunk.append(element)
            current_size += element_size

            if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
                chunks.append(''.join(map(str, current_chunk)))
                current_chunk = []
                current_size = 0

        if current_chunk:
            chunks.append(''.join(map(str, current_chunk)))

        return chunks

    def _parse_excluded_classes(self, excluded_classes):
        parsed = {}
        for rule in excluded_classes:
            element, cls = rule.split('.', 1)
            parsed.setdefault(element, set()).add(cls)
        return parsed

    def _should_exclude_element(self, element, excluded_classes):
        if self.html_excluded_elements and element.find_parent(self.html_excluded_elements):
            return True
        return self._is_element_excluded_by_class(element, excluded_classes)

    def _is_element_excluded_by_class(self, element, excluded_classes):
        for parent in element.parents:
            if self._element_matches_exclusion(parent, excluded_classes):
                return True
        return self._element_matches_exclusion(element, excluded_classes)

    def _element_matches_exclusion(self, element, excluded_classes):
        if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
            return True
        return element.name in excluded_classes and \
            any(cls in excluded_classes[element.name] for cls in element.get('class', []))

    def _extract_element_content(self, element):
        content = ' '.join(child.strip() for child in element.stripped_strings)
        return f'<{element.name}>{content}</{element.name}>\n'
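
For context on _parse_excluded_classes above: exclusion rules are expected as "element.class" strings, with "*" applying the class to any element. A standalone illustration of the parsing (the rule values are made up):

# Rules are "<element>.<class>"; "*" matches any element carrying the class.
rules = ['div.sidebar', 'nav.menu', '*.advert']

parsed = {}
for rule in rules:
    element, cls = rule.split('.', 1)
    parsed.setdefault(element, set()).add(cls)

print(parsed)
# {'div': {'sidebar'}, 'nav': {'menu'}, '*': {'advert'}}
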
eveai_workers/Processors/pdf_processor.py (modified)
@@ -5,29 +5,23 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 import re
 
 from langchain_core.runnables import RunnablePassthrough
 
 from common.extensions import minio_client
 from common.utils.model_utils import create_language_template
 from .processor import Processor
 
 
-class PDFProcessor:
+class PDFProcessor(Processor):
     def __init__(self, tenant, model_variables, document_version):
-        self.tenant = tenant
-        self.model_variables = model_variables
-        self.document_version = document_version
-
-        # Configuration parameters from model_variables
+        super().__init__(tenant, model_variables, document_version)
+        # PDF-specific initialization
         self.chunk_size = model_variables['PDF_chunk_size']
         self.chunk_overlap = model_variables['PDF_chunk_overlap']
         self.min_chunk_size = model_variables['PDF_min_chunk_size']
         self.max_chunk_size = model_variables['PDF_max_chunk_size']
 
-        # Set tuning variable for easy use
-        self.embed_tuning = model_variables['embed_tuning']
-
-    def process_pdf(self):
+    def process(self):
         self._log("Starting PDF processing")
         try:
             file_data = minio_client.download_document_file(
@@ -51,11 +45,6 @@ class PDFProcessor:
             self._log(f"Error processing PDF: {str(e)}", level='error')
             raise
 
-    def _log(self, message, level='debug'):
-        logger = current_app.logger
-        log_method = getattr(logger, level)
-        log_method(f"PDFProcessor - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
-
     def _extract_content(self, file_data):
         extracted_content = []
         with pdfplumber.open(io.BytesIO(file_data)) as pdf:
@@ -248,24 +237,3 @@ class PDFProcessor:
             markdown_chunks.append(result)
 
         return "\n\n".join(markdown_chunks)
-
-    def _save_markdown(self, markdown):
-        markdown_filename = f"{self.document_version.id}.md"
-        minio_client.upload_document_file(
-            self.tenant.id,
-            self.document_version.doc_id,
-            self.document_version.language,
-            self.document_version.id,
-            markdown_filename,
-            markdown.encode('utf-8')
-        )
-
-    def _save_intermediate(self, content, filename):
-        minio_client.upload_document_file(
-            self.tenant.id,
-            self.document_version.doc_id,
-            self.document_version.language,
-            self.document_version.id,
-            filename,
-            content.encode('utf-8')
-        )
eveai_workers/Processors/processor.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from abc import ABC, abstractmethod
from flask import current_app
from common.extensions import minio_client


class Processor(ABC):
    def __init__(self, tenant, model_variables, document_version):
        self.tenant = tenant
        self.model_variables = model_variables
        self.document_version = document_version
        self.embed_tuning = model_variables['embed_tuning']

    @abstractmethod
    def process(self):
        pass

    def _save_markdown(self, markdown):
        markdown_filename = f"{self.document_version.id}.md"
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            markdown_filename,
            markdown.encode('utf-8')
        )

    def _log(self, message, level='debug'):
        logger = current_app.logger
        log_method = getattr(logger, level)
        log_method(
            f"{self.__class__.__name__} - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")

    def _save_intermediate(self, content, filename):
        minio_client.upload_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            filename,
            content.encode('utf-8')
        )
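
To show the contract the base class defines, here is a minimal hypothetical subclass, illustration only and not part of this commit: it implements process() and leans on the inherited _save_markdown and _log helpers.

from common.extensions import minio_client

from .processor import Processor


class PlainTextProcessor(Processor):
    """Hypothetical example subclass; not in this commit."""

    def process(self):
        self._log("Starting plain text processing")
        file_data = minio_client.download_document_file(
            self.tenant.id,
            self.document_version.doc_id,
            self.document_version.language,
            self.document_version.id,
            self.document_version.file_name
        )
        markdown = file_data.decode('utf-8')
        self._save_markdown(markdown)  # inherited from Processor
        return markdown, "Untitled Plain Text Document"
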
eveai_workers/Processors/srt_processor.py (new file, 80 lines)
@@ -0,0 +1,80 @@
import re
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor


class SRTProcessor(Processor):
    def __init__(self, tenant, model_variables, document_version):
        super().__init__(tenant, model_variables, document_version)

    def process(self):
        self._log("Starting SRT processing")
        try:
            file_data = minio_client.download_document_file(
                self.tenant.id,
                self.document_version.doc_id,
                self.document_version.language,
                self.document_version.id,
                self.document_version.file_name
            )

            srt_content = file_data.decode('utf-8')
            cleaned_transcription = self._clean_srt(srt_content)
            markdown, title = self._generate_markdown_from_transcription(cleaned_transcription)

            self._save_markdown(markdown)
            self._log("Finished processing SRT")
            return markdown, title
        except Exception as e:
            self._log(f"Error processing SRT: {str(e)}", level='error')
            raise

    def _clean_srt(self, srt_content):
        # Remove timecodes and subtitle numbers
        cleaned_lines = []
        for line in srt_content.split('\n'):
            # Skip empty lines, subtitle numbers, and timecodes
            if line.strip() and not line.strip().isdigit() and not re.match(
                    r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
                cleaned_lines.append(line.strip())

        # Join the cleaned lines
        cleaned_text = ' '.join(cleaned_lines)

        # Remove any extra spaces
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

        return cleaned_text

    def _generate_markdown_from_transcription(self, transcription):
        self._log("Generating markdown from transcription")
        llm = self.model_variables['llm']
        template = self.model_variables['transcript_template']
        language_template = create_language_template(template, self.document_version.language)
        transcript_prompt = ChatPromptTemplate.from_template(language_template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()

        chain = setup | transcript_prompt | llm | output_parser

        input_transcript = {'transcript': transcription}
        markdown = chain.invoke(input_transcript)

        # Extract title from the markdown
        title = self._extract_title_from_markdown(markdown)

        return markdown, title

    def _extract_title_from_markdown(self, markdown):
        # Simple extraction of the first header as the title
        lines = markdown.split('\n')
        for line in lines:
            if line.startswith('# '):
                return line[2:].strip()
        return "Untitled SRT Transcription"
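
A quick sanity check of what _clean_srt does to a toy subtitle file (the sample input is invented); the same skip conditions and whitespace collapse are replicated inline:

import re

TIMECODE = re.compile(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}')

sample = """1
00:00:01,000 --> 00:00:03,000
Hello and welcome.

2
00:00:03,500 --> 00:00:06,000
Today we cover the new processors."""

# Drop subtitle numbers, timecodes and empty lines; join and collapse spaces.
kept = [line.strip() for line in sample.split('\n')
        if line.strip() and not line.strip().isdigit() and not TIMECODE.match(line)]
print(re.sub(r'\s+', ' ', ' '.join(kept)).strip())
# -> Hello and welcome. Today we cover the new processors.
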