- Improvements on document uploads (accept other files than html-files when entering a URL)

- Introduction of API-functionality (to be continued). Deduplication of document and url uploads between views and api.
- Improvements on document processing - introduction of processor classes to streamline document inputs
- Removed pure Youtube functionality, as Youtube retrieval of documents continuously changes. But added upload of srt, mp3, ogg and mp4
This commit is contained in:
Josako
2024-09-02 12:37:44 +02:00
parent a158655247
commit 914c265afe
21 changed files with 1425 additions and 852 deletions

View File

@@ -0,0 +1,187 @@
import io
import os
from pydub import AudioSegment
import tempfile
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
import subprocess
class AudioProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.transcription_client = model_variables['transcription_client']
self.transcription_model = model_variables['transcription_model']
self.ffmpeg_path = 'ffmpeg'
def process(self):
self._log("Starting Audio processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
self.document_version.file_name
)
compressed_audio = self._compress_audio(file_data)
transcription = self._transcribe_audio(compressed_audio)
markdown, title = self._generate_markdown_from_transcription(transcription)
self._save_markdown(markdown)
self._log("Finished processing Audio")
return markdown, title
except Exception as e:
self._log(f"Error processing Audio: {str(e)}", level='error')
raise
def _compress_audio(self, audio_data):
self._log("Compressing audio")
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{self.document_version.file_type}') as temp_input:
temp_input.write(audio_data)
temp_input.flush()
# Use a unique filename for the output to avoid conflicts
output_filename = f'compressed_{os.urandom(8).hex()}.mp3'
output_path = os.path.join(tempfile.gettempdir(), output_filename)
try:
result = subprocess.run(
[self.ffmpeg_path, '-y', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', output_path],
capture_output=True,
text=True,
check=True
)
with open(output_path, 'rb') as f:
compressed_data = f.read()
# Save compressed audio to MinIO
compressed_filename = f"{self.document_version.id}_compressed.mp3"
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
compressed_filename,
compressed_data
)
self._log(f"Saved compressed audio to MinIO: {compressed_filename}")
return compressed_data
except subprocess.CalledProcessError as e:
error_message = f"Compression failed: {e.stderr}"
self._log(error_message, level='error')
raise Exception(error_message)
finally:
# Clean up temporary files
os.unlink(temp_input.name)
if os.path.exists(output_path):
os.unlink(output_path)
def _transcribe_audio(self, audio_data):
self._log("Starting audio transcription")
audio = AudioSegment.from_file(io.BytesIO(audio_data), format="mp3")
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
transcriptions = []
for i, chunk in enumerate(audio[::segment_length]):
self._log(f'Processing chunk {i + 1} of {len(audio) // segment_length + 1}')
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
chunk.export(temp_audio.name, format="mp3")
temp_audio.flush()
try:
file_size = os.path.getsize(temp_audio.name)
self._log(f"Temporary audio file size: {file_size} bytes")
with open(temp_audio.name, 'rb') as audio_file:
file_start = audio_file.read(100)
self._log(f"First 100 bytes of audio file: {file_start}")
audio_file.seek(0) # Reset file pointer to the beginning
self._log("Calling transcription API")
transcription = self.transcription_client.audio.transcriptions.create(
file=audio_file,
model=self.transcription_model,
language=self.document_version.language,
response_format='verbose_json',
)
self._log("Transcription API call completed")
if transcription:
# Handle the transcription result based on its type
if isinstance(transcription, str):
self._log(f"Transcription result (string): {transcription[:100]}...")
transcriptions.append(transcription)
elif hasattr(transcription, 'text'):
self._log(
f"Transcription result (object with 'text' attribute): {transcription.text[:100]}...")
transcriptions.append(transcription.text)
else:
self._log(f"Transcription result (unknown type): {str(transcription)[:100]}...")
transcriptions.append(str(transcription))
else:
self._log("Warning: Received empty transcription", level='warning')
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
finally:
os.unlink(temp_audio.name)
full_transcription = " ".join(filter(None, transcriptions))
if not full_transcription:
self._log("Warning: No transcription was generated", level='warning')
full_transcription = "No transcription available."
# Save transcription to MinIO
transcription_filename = f"{self.document_version.id}_transcription.txt"
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
transcription_filename,
full_transcription.encode('utf-8')
)
self._log(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription
def _generate_markdown_from_transcription(self, transcription):
self._log("Generating markdown from transcription")
llm = self.model_variables['llm']
template = self.model_variables['transcript_template']
language_template = create_language_template(template, self.document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | transcript_prompt | llm | output_parser
input_transcript = {'transcript': transcription}
markdown = chain.invoke(input_transcript)
# Extract title from the markdown
title = self._extract_title_from_markdown(markdown)
return markdown, title
def _extract_title_from_markdown(self, markdown):
# Simple extraction of the first header as the title
lines = markdown.split('\n')
for line in lines:
if line.startswith('# '):
return line[2:].strip()
return "Untitled Audio Transcription"

View File

@@ -0,0 +1,142 @@
from bs4 import BeautifulSoup
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import db, minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
class HTMLProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
self.html_tags = model_variables['html_tags']
self.html_end_tags = model_variables['html_end_tags']
self.html_included_elements = model_variables['html_included_elements']
self.html_excluded_elements = model_variables['html_excluded_elements']
def process(self):
self._log("Starting HTML processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
self.document_version.file_name
)
html_content = file_data.decode('utf-8')
extracted_html, title = self._parse_html(html_content)
markdown = self._generate_markdown_from_html(extracted_html)
self._save_markdown(markdown)
self._log("Finished processing HTML")
return markdown, title
except Exception as e:
self._log(f"Error processing HTML: {str(e)}", level='error')
raise
def _parse_html(self, html_content):
self._log(f'Parsing HTML for tenant {self.tenant.id}')
soup = BeautifulSoup(html_content, 'html.parser')
extracted_html = ''
excluded_classes = self._parse_excluded_classes(self.tenant.html_excluded_classes)
if self.html_included_elements:
elements_to_parse = soup.find_all(self.html_included_elements)
else:
elements_to_parse = [soup]
for element in elements_to_parse:
for sub_element in element.find_all(self.html_tags):
if self._should_exclude_element(sub_element, excluded_classes):
continue
extracted_html += self._extract_element_content(sub_element)
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
self._log(f'Generating markdown from HTML for tenant {self.tenant.id}')
llm = self.model_variables['llm']
template = self.model_variables['html_parse_template']
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | parse_prompt | llm | output_parser
soup = BeautifulSoup(html_content, 'lxml')
chunks = self._split_content(soup)
markdown_chunks = []
for chunk in chunks:
if self.embed_tuning:
self._log(f'Processing chunk: \n{chunk}\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if self.embed_tuning:
self._log(f'Processed markdown chunk: \n{markdown_chunk}\n')
markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
return markdown
def _split_content(self, soup, max_size=20000):
chunks = []
current_chunk = []
current_size = 0
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
element_html = str(element)
element_size = len(element_html)
if current_size + element_size > max_size and current_chunk:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
current_chunk.append(element)
current_size += element_size
if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
if current_chunk:
chunks.append(''.join(map(str, current_chunk)))
return chunks
def _parse_excluded_classes(self, excluded_classes):
parsed = {}
for rule in excluded_classes:
element, cls = rule.split('.', 1)
parsed.setdefault(element, set()).add(cls)
return parsed
def _should_exclude_element(self, element, excluded_classes):
if self.html_excluded_elements and element.find_parent(self.html_excluded_elements):
return True
return self._is_element_excluded_by_class(element, excluded_classes)
def _is_element_excluded_by_class(self, element, excluded_classes):
for parent in element.parents:
if self._element_matches_exclusion(parent, excluded_classes):
return True
return self._element_matches_exclusion(element, excluded_classes)
def _element_matches_exclusion(self, element, excluded_classes):
if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
return True
return element.name in excluded_classes and \
any(cls in excluded_classes[element.name] for cls in element.get('class', []))
def _extract_element_content(self, element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'

View File

@@ -5,29 +5,23 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import re
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
class PDFProcessor:
class PDFProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
self.tenant = tenant
self.model_variables = model_variables
self.document_version = document_version
# Configuration parameters from model_variables
super().__init__(tenant, model_variables, document_version)
# PDF-specific initialization
self.chunk_size = model_variables['PDF_chunk_size']
self.chunk_overlap = model_variables['PDF_chunk_overlap']
self.min_chunk_size = model_variables['PDF_min_chunk_size']
self.max_chunk_size = model_variables['PDF_max_chunk_size']
# Set tuning variable for easy use
self.embed_tuning = model_variables['embed_tuning']
def process_pdf(self):
def process(self):
self._log("Starting PDF processing")
try:
file_data = minio_client.download_document_file(
@@ -51,11 +45,6 @@ class PDFProcessor:
self._log(f"Error processing PDF: {str(e)}", level='error')
raise
def _log(self, message, level='debug'):
logger = current_app.logger
log_method = getattr(logger, level)
log_method(f"PDFProcessor - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
def _extract_content(self, file_data):
extracted_content = []
with pdfplumber.open(io.BytesIO(file_data)) as pdf:
@@ -248,24 +237,3 @@ class PDFProcessor:
markdown_chunks.append(result)
return "\n\n".join(markdown_chunks)
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
markdown_filename,
markdown.encode('utf-8')
)
def _save_intermediate(self, content, filename):
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
filename,
content.encode('utf-8')
)

View File

@@ -0,0 +1,42 @@
from abc import ABC, abstractmethod
from flask import current_app
from common.extensions import minio_client
class Processor(ABC):
def __init__(self, tenant, model_variables, document_version):
self.tenant = tenant
self.model_variables = model_variables
self.document_version = document_version
self.embed_tuning = model_variables['embed_tuning']
@abstractmethod
def process(self):
pass
def _save_markdown(self, markdown):
markdown_filename = f"{self.document_version.id}.md"
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
markdown_filename,
markdown.encode('utf-8')
)
def _log(self, message, level='debug'):
logger = current_app.logger
log_method = getattr(logger, level)
log_method(
f"{self.__class__.__name__} - Tenant {self.tenant.id}, Document {self.document_version.id}: {message}")
def _save_intermediate(self, content, filename):
minio_client.upload_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
filename,
content.encode('utf-8')
)

View File

@@ -0,0 +1,80 @@
import re
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from common.extensions import minio_client
from common.utils.model_utils import create_language_template
from .processor import Processor
class SRTProcessor(Processor):
def __init__(self, tenant, model_variables, document_version):
super().__init__(tenant, model_variables, document_version)
def process(self):
self._log("Starting SRT processing")
try:
file_data = minio_client.download_document_file(
self.tenant.id,
self.document_version.doc_id,
self.document_version.language,
self.document_version.id,
self.document_version.file_name
)
srt_content = file_data.decode('utf-8')
cleaned_transcription = self._clean_srt(srt_content)
markdown, title = self._generate_markdown_from_transcription(cleaned_transcription)
self._save_markdown(markdown)
self._log("Finished processing SRT")
return markdown, title
except Exception as e:
self._log(f"Error processing SRT: {str(e)}", level='error')
raise
def _clean_srt(self, srt_content):
# Remove timecodes and subtitle numbers
cleaned_lines = []
for line in srt_content.split('\n'):
# Skip empty lines, subtitle numbers, and timecodes
if line.strip() and not line.strip().isdigit() and not re.match(
r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line):
cleaned_lines.append(line.strip())
# Join the cleaned lines
cleaned_text = ' '.join(cleaned_lines)
# Remove any extra spaces
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
return cleaned_text
def _generate_markdown_from_transcription(self, transcription):
self._log("Generating markdown from transcription")
llm = self.model_variables['llm']
template = self.model_variables['transcript_template']
language_template = create_language_template(template, self.document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | transcript_prompt | llm | output_parser
input_transcript = {'transcript': transcription}
markdown = chain.invoke(input_transcript)
# Extract title from the markdown
title = self._extract_title_from_markdown(markdown)
return markdown, title
def _extract_title_from_markdown(self, markdown):
# Simple extraction of the first header as the title
lines = markdown.split('\n')
for line in lines:
if line.startswith('# '):
return line[2:].strip()
return "Untitled SRT Transcription"

View File

@@ -1,26 +1,16 @@
import io
import os
from datetime import datetime as dt, timezone as tz
import subprocess
import gevent
from bs4 import BeautifulSoup
import html
from celery import states
from flask import current_app
# OpenAI imports
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy.exc import SQLAlchemyError
from pytube import YouTube
import PyPDF2
from pydub import AudioSegment
import tempfile
from common.extensions import db, minio_client
from common.models.document import DocumentVersion, Embedding
@@ -29,7 +19,10 @@ from common.utils.celery_utils import current_celery
from common.utils.database import Database
from common.utils.model_utils import select_model_variables, create_language_template
from common.utils.os_utils import safe_remove, sync_folder
from eveai_workers.Processors.PDF_Processor import PDFProcessor
from eveai_workers.Processors.audio_processor import AudioProcessor
from eveai_workers.Processors.html_processor import HTMLProcessor
from eveai_workers.Processors.pdf_processor import PDFProcessor
from eveai_workers.Processors.srt_processor import SRTProcessor
@current_celery.task(name='create_embeddings', queue='embeddings')
@@ -85,8 +78,10 @@ def create_embeddings(tenant_id, document_version_id):
process_pdf(tenant, model_variables, document_version)
case 'html':
process_html(tenant, model_variables, document_version)
case 'youtube':
process_youtube(tenant, model_variables, document_version)
case 'srt':
process_srt(tenant, model_variables, document_version)
case 'mp4' | 'mp3' | 'ogg':
process_audio(tenant, model_variables, document_version)
case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} '
@@ -104,83 +99,6 @@ def create_embeddings(tenant_id, document_version_id):
raise
# def process_pdf(tenant, model_variables, document_version):
# file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id, document_version.file_name)
#
# pdf_text = ''
# pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_data))
# for page in pdf_reader.pages:
# pdf_text += page.extract_text()
#
# markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
# markdown_file_name = f'{document_version.id}.md'
# minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id,
# markdown_file_name, markdown.encode())
#
# potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
# chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
# model_variables['max_chunk_size'])
#
# if len(chunks) > 1:
# summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
# document_version.system_context = f'Summary: {summary}\n'
# else:
# document_version.system_context = ''
#
# enriched_chunks = enrich_chunks(tenant, document_version, chunks)
# embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
#
# try:
# db.session.add(document_version)
# document_version.processing_finished_at = dt.now(tz.utc)
# document_version.processing = False
# db.session.add_all(embeddings)
# db.session.commit()
# except SQLAlchemyError as e:
# current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
# f'on HTML, document version {document_version.id}'
# f'error: {e}')
# raise
#
# current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
# f'on document version {document_version.id} :-)')
def process_pdf(tenant, model_variables, document_version):
processor = PDFProcessor(tenant, model_variables, document_version)
markdown, title = processor.process_pdf()
# Create potential chunks for embedding
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
# Enrich chunks
enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
# Create embeddings
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
# Update document version and save embeddings
try:
db.session.add(document_version)
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing = False
db.session.add_all(embeddings)
db.session.commit()
except SQLAlchemyError as e:
current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
f'on PDF, document version {document_version.id}'
f'error: {e}')
raise
current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
f'on document version {document_version.id} :-)')
def delete_embeddings_for_document_version(document_version):
embeddings_to_delete = db.session.query(Embedding).filter_by(doc_vers_id=document_version.id).all()
for embedding in embeddings_to_delete:
@@ -193,45 +111,53 @@ def delete_embeddings_for_document_version(document_version):
raise
def process_pdf(tenant, model_variables, document_version):
processor = PDFProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
embed_markdown(tenant, model_variables, document_version, markdown, title)
def process_html(tenant, model_variables, document_version):
file_data = minio_client.download_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, document_version.file_name)
html_content = file_data.decode('utf-8')
processor = HTMLProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# The tags to be considered can be dependent on the tenant
html_tags = model_variables['html_tags']
html_end_tags = model_variables['html_end_tags']
html_included_elements = model_variables['html_included_elements']
html_excluded_elements = model_variables['html_excluded_elements']
# Process markdown and embed
embed_markdown(tenant, model_variables, document_version, markdown, title)
extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
excluded_elements=html_excluded_elements)
extracted_file_name = f'{document_version.id}-extracted.html'
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id,
extracted_file_name, extracted_html.encode())
def process_audio(tenant, model_variables, document_version):
processor = AudioProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
markdown_file_name = f'{document_version.id}.md'
minio_client.upload_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id,
markdown_file_name, markdown.encode())
# Process markdown and embed
embed_markdown(tenant, model_variables, document_version, markdown, title)
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
def process_srt(tenant, model_variables, document_version):
processor = SRTProcessor(tenant, model_variables, document_version)
markdown, title = processor.process()
# Process markdown and embed
embed_markdown(tenant, model_variables, document_version, markdown, title)
def embed_markdown(tenant, model_variables, document_version, markdown, title):
# Create potential chunks
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, f"{document_version.id}.md")
# Combine chunks for embedding
chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
document_version.system_context = (f'Title: {title}\n'
f'Summary: {summary}\n')
else:
document_version.system_context = (f'Title: {title}\n')
# Enrich chunks
enriched_chunks = enrich_chunks(tenant, model_variables, document_version, title, chunks)
enriched_chunks = enrich_chunks(tenant, document_version, title, chunks)
# Create embeddings
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
# Update document version and save embeddings
try:
db.session.add(document_version)
document_version.processing_finished_at = dt.now(tz.utc)
@@ -248,12 +174,18 @@ def process_html(tenant, model_variables, document_version):
f'on document version {document_version.id} :-)')
def enrich_chunks(tenant, document_version, title, chunks):
def enrich_chunks(tenant, model_variables, document_version, title, chunks):
current_app.logger.debug(f'Enriching chunks for tenant {tenant.id} '
f'on document version {document_version.id}')
current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
summary = ''
if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
chunk_total_context = (f'Filename: {document_version.file_name}\n'
f'User Context:\n{document_version.user_context}\n\n'
f'Title: {title}\n'
f'{summary}\n'
f'{document_version.system_context}\n\n')
enriched_chunks = []
initial_chunk = (f'Filename: {document_version.file_name}\n'
@@ -272,95 +204,6 @@ def enrich_chunks(tenant, document_version, title, chunks):
return enriched_chunks
def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
current_app.logger.debug(f'Generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['html_parse_template']
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | parse_prompt | llm | output_parser
soup = BeautifulSoup(html_content, 'lxml')
def split_content(soup, max_size=20000):
chunks = []
current_chunk = []
current_size = 0
for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'table']):
element_html = str(element)
element_size = len(element_html)
if current_size + element_size > max_size and current_chunk:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
current_chunk.append(element)
current_size += element_size
if element.name in ['h1', 'h2', 'h3'] and current_size > max_size:
chunks.append(''.join(map(str, current_chunk)))
current_chunk = []
current_size = 0
if current_chunk:
chunks.append(''.join(map(str, current_chunk)))
return chunks
chunks = split_content(soup)
markdown_chunks = []
for chunk in chunks:
current_app.logger.debug(f'Processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processing chunk: \n '
f'------------------\n'
f'{chunk}\n'
f'------------------\n')
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Processed markdown chunk: \n '
f'-------------------------\n'
f'{markdown_chunk}\n'
f'-------------------------\n')
current_app.logger.debug(f'Finished processing chunk to generate markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
# Combine all markdown chunks
markdown = "\n\n".join(markdown_chunks)
current_app.logger.debug(f'Finished generating markdown from HTML for tenant {tenant.id} '
f'on document version {document_version.id}')
return markdown
def generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_content):
current_app.logger.debug(f'Generating Markdown from PDF for tenant {tenant.id} '
f'on document version {document_version.id}')
llm = model_variables['llm']
template = model_variables['pdf_parse_template']
parse_prompt = ChatPromptTemplate.from_template(template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
chain = setup | parse_prompt | llm | output_parser
input_pdf = {"pdf_content": pdf_content}
markdown = chain.invoke(input_pdf)
return markdown
def summarize_chunk(tenant, model_variables, document_version, chunk):
current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
f'on document version {document_version.id}')
@@ -415,65 +258,6 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
return new_embeddings
def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
current_app.logger.debug(f'Parsing HTML for tenant {tenant.id}')
soup = BeautifulSoup(html_content, 'html.parser')
extracted_html = ''
excluded_classes = parse_excluded_classes(tenant.html_excluded_classes)
if included_elements:
elements_to_parse = soup.find_all(included_elements)
else:
elements_to_parse = [soup]
log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse)
for element in elements_to_parse:
for sub_element in element.find_all(tags):
if should_exclude_element(sub_element, excluded_elements, excluded_classes):
continue
extracted_html += extract_element_content(sub_element)
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
current_app.logger.debug(f'Finished parsing HTML for tenant {tenant.id}')
return extracted_html, title
def parse_excluded_classes(excluded_classes):
parsed = {}
for rule in excluded_classes:
element, cls = rule.split('.', 1)
parsed.setdefault(element, set()).add(cls)
return parsed
def should_exclude_element(element, excluded_elements, excluded_classes):
if excluded_elements and element.find_parent(excluded_elements):
return True
return is_element_excluded_by_class(element, excluded_classes)
def is_element_excluded_by_class(element, excluded_classes):
for parent in element.parents:
if element_matches_exclusion(parent, excluded_classes):
return True
return element_matches_exclusion(element, excluded_classes)
def element_matches_exclusion(element, excluded_classes):
if '*' in excluded_classes and any(cls in excluded_classes['*'] for cls in element.get('class', [])):
return True
return element.name in excluded_classes and \
any(cls in excluded_classes[element.name] for cls in element.get('class', []))
def extract_element_content(element):
content = ' '.join(child.strip() for child in element.stripped_strings)
return f'<{element.name}>{content}</{element.name}>\n'
def log_parsing_info(tenant, tags, included_elements, excluded_elements, excluded_classes, elements_to_parse):
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
@@ -484,244 +268,242 @@ def log_parsing_info(tenant, tags, included_elements, excluded_elements, exclude
current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
def process_youtube(tenant, model_variables, document_version):
base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
document_version.file_location)
download_file_name = f'{document_version.id}.mp4'
compressed_file_name = f'{document_version.id}.mp3'
transcription_file_name = f'{document_version.id}.txt'
markdown_file_name = f'{document_version.id}.md'
# Remove existing files (in case of a re-processing of the file
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, download_file_name)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, compressed_file_name)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, transcription_file_name)
minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
document_version.id, markdown_file_name)
of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
download_file_name)
document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
model_variables['max_chunk_size'])
enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
try:
db.session.add(document_version)
document_version.processing_finished_at = dt.now(tz.utc)
document_version.processing = False
db.session.add_all(embeddings)
db.session.commit()
except SQLAlchemyError as e:
current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
f'on Youtube document version {document_version.id}'
f'error: {e}')
raise
current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
f'on Youtube document version {document_version.id} :-)')
def download_youtube(url, tenant_id, document_version, file_name):
try:
current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
yt = YouTube(url)
stream = yt.streams.get_audio_only()
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
stream.download(output_path=temp_file.name)
with open(temp_file.name, 'rb') as f:
file_data = f.read()
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id,
file_name, file_data)
current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
return file_name, yt.title, yt.description, yt.author
except Exception as e:
current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
raise
def compress_audio(tenant_id, document_version, input_file, output_file):
try:
current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
temp_input.write(input_data)
temp_input.flush()
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
result = subprocess.run(
['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
capture_output=True,
text=True
)
if result.returncode != 0:
raise Exception(f"Compression failed: {result.stderr}")
with open(temp_output.name, 'rb') as f:
compressed_data = f.read()
minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id,
output_file, compressed_data)
current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
except Exception as e:
current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
raise
def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
try:
current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
client = model_variables['transcription_client']
model = model_variables['transcription_model']
# Download the audio file from MinIO
audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
document_version.id, input_file)
# Load the audio data into pydub
audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
# Define segment length (e.g., 10 minutes)
segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
transcriptions = []
# Split audio into segments and transcribe each
for i, chunk in enumerate(audio[::segment_length]):
current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
chunk.export(temp_audio.name, format="mp3")
with open(temp_audio.name, 'rb') as audio_segment:
transcription = client.audio.transcriptions.create(
file=audio_segment,
model=model,
language=document_version.language,
response_format='verbose_json',
)
transcriptions.append(transcription.text)
os.unlink(temp_audio.name) # Delete the temporary file
# Combine all transcriptions
full_transcription = " ".join(transcriptions)
# Upload the full transcription to MinIO
minio_client.upload_document_file(
tenant_id,
document_version.doc_id,
document_version.language,
document_version.id,
output_file,
full_transcription.encode('utf-8')
)
current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
except Exception as e:
current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
raise
def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
try:
current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
char_splitter = CharacterTextSplitter(separator='.',
chunk_size=model_variables['annotation_chunk_length'],
chunk_overlap=0)
headers_to_split_on = [
("#", "Header 1"),
("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
llm = model_variables['llm']
template = model_variables['transcript_template']
language_template = create_language_template(template, document_version.language)
transcript_prompt = ChatPromptTemplate.from_template(language_template)
setup = RunnablePassthrough()
output_parser = StrOutputParser()
# Download the transcription file from MinIO
transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
document_version.language, document_version.id,
input_file)
transcript = transcript_data.decode('utf-8')
chain = setup | transcript_prompt | llm | output_parser
chunks = char_splitter.split_text(transcript)
all_markdown_chunks = []
last_markdown_chunk = ''
for chunk in chunks:
current_app.logger.debug(f'Annotating next chunk of {len(chunks)} for tenant {tenant.id}')
full_input = last_markdown_chunk + '\n' + chunk
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Annotating chunk: \n '
f'------------------\n'
f'{full_input}\n'
f'------------------\n')
input_transcript = {'transcript': full_input}
markdown = chain.invoke(input_transcript)
# GPT-4o returns some kind of content description: ```markdown <text> ```
if markdown.startswith("```markdown"):
markdown = "\n".join(markdown.strip().split("\n")[1:-1])
if tenant.embed_tuning:
current_app.embed_tuning_logger.debug(f'Markdown Received: \n '
f'------------------\n'
f'{markdown}\n'
f'------------------\n')
md_header_splits = markdown_splitter.split_text(markdown)
markdown_chunks = [doc.page_content for doc in md_header_splits]
# claude-3.5-sonnet returns introductory text
if not markdown_chunks[0].startswith('#'):
markdown_chunks.pop(0)
last_markdown_chunk = markdown_chunks[-1]
last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
markdown_chunks.pop()
all_markdown_chunks += markdown_chunks
all_markdown_chunks += [last_markdown_chunk]
annotated_transcript = '\n'.join(all_markdown_chunks)
# Upload the annotated transcript to MinIO
minio_client.upload_document_file(
tenant.id,
document_version.doc_id,
document_version.language,
document_version.id,
output_file,
annotated_transcript.encode('utf-8')
)
current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
except Exception as e:
current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
raise
# def process_youtube(tenant, model_variables, document_version):
# download_file_name = f'{document_version.id}.mp4'
# compressed_file_name = f'{document_version.id}.mp3'
# transcription_file_name = f'{document_version.id}.txt'
# markdown_file_name = f'{document_version.id}.md'
#
# # Remove existing files (in case of a re-processing of the file
# minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id, download_file_name)
# minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id, compressed_file_name)
# minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id, transcription_file_name)
# minio_client.delete_document_file(tenant.id, document_version.doc_id, document_version.language,
# document_version.id, markdown_file_name)
#
# of, title, description, author = download_youtube(document_version.url, tenant.id, document_version,
# download_file_name)
# document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
# compress_audio(tenant.id, document_version, download_file_name, compressed_file_name)
# transcribe_audio(tenant.id, document_version, compressed_file_name, transcription_file_name, model_variables)
# annotate_transcription(tenant, document_version, transcription_file_name, markdown_file_name, model_variables)
#
# potential_chunks = create_potential_chunks_for_markdown(tenant.id, document_version, markdown_file_name)
# actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
# model_variables['max_chunk_size'])
#
# enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
# embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
#
# try:
# db.session.add(document_version)
# document_version.processing_finished_at = dt.now(tz.utc)
# document_version.processing = False
# db.session.add_all(embeddings)
# db.session.commit()
# except SQLAlchemyError as e:
# current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
# f'on Youtube document version {document_version.id}'
# f'error: {e}')
# raise
#
# current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
# f'on Youtube document version {document_version.id} :-)')
#
#
# def download_youtube(url, tenant_id, document_version, file_name):
# try:
# current_app.logger.info(f'Downloading YouTube video: {url} for tenant: {tenant_id}')
# yt = YouTube(url)
# stream = yt.streams.get_audio_only()
#
# with tempfile.NamedTemporaryFile(delete=False) as temp_file:
# stream.download(output_path=temp_file.name)
# with open(temp_file.name, 'rb') as f:
# file_data = f.read()
#
# minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
# document_version.id,
# file_name, file_data)
#
# current_app.logger.info(f'Downloaded YouTube video: {url} for tenant: {tenant_id}')
# return file_name, yt.title, yt.description, yt.author
# except Exception as e:
# current_app.logger.error(f'Error downloading YouTube video: {url} for tenant: {tenant_id} with error: {e}')
# raise
#
#
# def compress_audio(tenant_id, document_version, input_file, output_file):
# try:
# current_app.logger.info(f'Compressing audio for tenant: {tenant_id}')
#
# input_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
# document_version.id, input_file)
#
# with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_input:
# temp_input.write(input_data)
# temp_input.flush()
#
# with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_output:
# result = subprocess.run(
# ['ffmpeg', '-i', temp_input.name, '-b:a', '64k', '-f', 'mp3', temp_output.name],
# capture_output=True,
# text=True
# )
#
# if result.returncode != 0:
# raise Exception(f"Compression failed: {result.stderr}")
#
# with open(temp_output.name, 'rb') as f:
# compressed_data = f.read()
#
# minio_client.upload_document_file(tenant_id, document_version.doc_id, document_version.language,
# document_version.id,
# output_file, compressed_data)
#
# current_app.logger.info(f'Compressed audio for tenant: {tenant_id}')
# except Exception as e:
# current_app.logger.error(f'Error compressing audio for tenant: {tenant_id} with error: {e}')
# raise
#
#
# def transcribe_audio(tenant_id, document_version, input_file, output_file, model_variables):
# try:
# current_app.logger.info(f'Transcribing audio for tenant: {tenant_id}')
# client = model_variables['transcription_client']
# model = model_variables['transcription_model']
#
# # Download the audio file from MinIO
# audio_data = minio_client.download_document_file(tenant_id, document_version.doc_id, document_version.language,
# document_version.id, input_file)
#
# # Load the audio data into pydub
# audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
#
# # Define segment length (e.g., 10 minutes)
# segment_length = 10 * 60 * 1000 # 10 minutes in milliseconds
#
# transcriptions = []
#
# # Split audio into segments and transcribe each
# for i, chunk in enumerate(audio[::segment_length]):
# current_app.logger.debug(f'Transcribing chunk {i + 1} of {len(audio) // segment_length + 1}')
#
# with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
# chunk.export(temp_audio.name, format="mp3")
#
# with open(temp_audio.name, 'rb') as audio_segment:
# transcription = client.audio.transcriptions.create(
# file=audio_segment,
# model=model,
# language=document_version.language,
# response_format='verbose_json',
# )
#
# transcriptions.append(transcription.text)
#
# os.unlink(temp_audio.name) # Delete the temporary file
#
# # Combine all transcriptions
# full_transcription = " ".join(transcriptions)
#
# # Upload the full transcription to MinIO
# minio_client.upload_document_file(
# tenant_id,
# document_version.doc_id,
# document_version.language,
# document_version.id,
# output_file,
# full_transcription.encode('utf-8')
# )
#
# current_app.logger.info(f'Transcribed audio for tenant: {tenant_id}')
# except Exception as e:
# current_app.logger.error(f'Error transcribing audio for tenant: {tenant_id}, with error: {e}')
# raise
#
#
# def annotate_transcription(tenant, document_version, input_file, output_file, model_variables):
# try:
# current_app.logger.debug(f'Annotating transcription for tenant {tenant.id}')
#
# char_splitter = CharacterTextSplitter(separator='.',
# chunk_size=model_variables['annotation_chunk_length'],
# chunk_overlap=0)
#
# headers_to_split_on = [
# ("#", "Header 1"),
# ("##", "Header 2"),
# ]
# markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
#
# llm = model_variables['llm']
# template = model_variables['transcript_template']
# language_template = create_language_template(template, document_version.language)
# transcript_prompt = ChatPromptTemplate.from_template(language_template)
# setup = RunnablePassthrough()
# output_parser = StrOutputParser()
#
# # Download the transcription file from MinIO
# transcript_data = minio_client.download_document_file(tenant.id, document_version.doc_id,
# document_version.language, document_version.id,
# input_file)
# transcript = transcript_data.decode('utf-8')
#
# chain = setup | transcript_prompt | llm | output_parser
#
# chunks = char_splitter.split_text(transcript)
# all_markdown_chunks = []
# last_markdown_chunk = ''
# for chunk in chunks:
# current_app.logger.debug(f'Annotating next chunk of {len(chunks)} for tenant {tenant.id}')
# full_input = last_markdown_chunk + '\n' + chunk
# if tenant.embed_tuning:
# current_app.embed_tuning_logger.debug(f'Annotating chunk: \n '
# f'------------------\n'
# f'{full_input}\n'
# f'------------------\n')
# input_transcript = {'transcript': full_input}
# markdown = chain.invoke(input_transcript)
# # GPT-4o returns some kind of content description: ```markdown <text> ```
# if markdown.startswith("```markdown"):
# markdown = "\n".join(markdown.strip().split("\n")[1:-1])
# if tenant.embed_tuning:
# current_app.embed_tuning_logger.debug(f'Markdown Received: \n '
# f'------------------\n'
# f'{markdown}\n'
# f'------------------\n')
# md_header_splits = markdown_splitter.split_text(markdown)
# markdown_chunks = [doc.page_content for doc in md_header_splits]
# # claude-3.5-sonnet returns introductory text
# if not markdown_chunks[0].startswith('#'):
# markdown_chunks.pop(0)
# last_markdown_chunk = markdown_chunks[-1]
# last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
# markdown_chunks.pop()
# all_markdown_chunks += markdown_chunks
#
# all_markdown_chunks += [last_markdown_chunk]
#
# annotated_transcript = '\n'.join(all_markdown_chunks)
#
# # Upload the annotated transcript to MinIO
# minio_client.upload_document_file(
# tenant.id,
# document_version.doc_id,
# document_version.language,
# document_version.id,
# output_file,
# annotated_transcript.encode('utf-8')
# )
#
# current_app.logger.info(f'Annotated transcription for tenant {tenant.id}')
# except Exception as e:
# current_app.logger.error(f'Error annotating transcription for tenant {tenant.id}, with error: {e}')
# raise
def create_potential_chunks_for_markdown(tenant_id, document_version, input_file):
@@ -779,4 +561,3 @@ def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
actual_chunks.append(current_chunk)
return actual_chunks