- Improve annotation algorithm for YouTube (and others)
- Patch Pytube
- Improve OS deletion of files and writing of files
- Start working on Claude
- Improve template management
@@ -1,5 +1,7 @@
 import os
 from datetime import datetime as dt, timezone as tz
+import subprocess
+
 
 import gevent
 from bs4 import BeautifulSoup
@@ -16,6 +18,8 @@ from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy.exc import SQLAlchemyError
 from pytube import YouTube
 import PyPDF2
+from pydub import AudioSegment
+import tempfile
 
 from common.extensions import db
 from common.models.document import DocumentVersion, Embedding
@@ -23,6 +27,7 @@ from common.models.user import Tenant
 from common.utils.celery_utils import current_celery
 from common.utils.database import Database
 from common.utils.model_utils import select_model_variables, create_language_template
+from common.utils.os_utils import safe_remove, sync_folder
 
 
 @current_celery.task(name='create_embeddings', queue='embeddings')
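The new `safe_remove` and `sync_folder` helpers live in `common.utils.os_utils`, whose source is not part of this diff. A minimal sketch of what such helpers typically do (tolerate already-missing files on delete, then fsync the directory so deletions are durable before re-processing) might look like the following; the names match the import, but the bodies are assumptions:

```python
import os


def safe_remove(path):
    # Hypothetical sketch: delete a file, tolerating its absence.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def sync_folder(path):
    # Hypothetical sketch: flush directory metadata to disk (POSIX)
    # so earlier deletions are visible before new files are written.
    fd = os.open(path, os.O_RDONLY)
    try:
        os.fsync(fd)
    finally:
        os.close(fd)
```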
@@ -193,7 +198,7 @@ def process_html(tenant, model_variables, document_version):
         create_embeddings.update_state(state=states.FAILURE)
         raise
 
-    extracted_html, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
+    extracted_html, title = parse_html(tenant, html_content, html_tags, included_elements=html_included_elements,
                                        excluded_elements=html_excluded_elements)
     extracted_file_name = f'{document_version.id}-extracted.html'
     output_file = os.path.join(base_path, extracted_file_name)
@@ -345,7 +350,7 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
     return new_embeddings
 
 
-def parse_html(html_content, tags, included_elements=None, excluded_elements=None):
+def parse_html(tenant, html_content, tags, included_elements=None, excluded_elements=None):
     soup = BeautifulSoup(html_content, 'html.parser')
     extracted_html = ''
 
@@ -354,18 +359,22 @@ def parse_html(html_content, tags, included_elements=None, excluded_elements=Non
     else:
         elements_to_parse = [soup]  # parse the entire document if no included_elements specified
 
-    current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
-    current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
-    current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
-    current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
+    if tenant.embed_tuning:
+        current_app.embed_tuning_logger.debug(f'Tags to parse: {tags}')
+        current_app.embed_tuning_logger.debug(f'Included Elements: {included_elements}')
+        current_app.embed_tuning_logger.debug(f'Included Elements: {len(included_elements)}')
+        current_app.embed_tuning_logger.debug(f'Excluded Elements: {excluded_elements}')
+        current_app.embed_tuning_logger.debug(f'Found {len(elements_to_parse)} elements to parse')
+        current_app.embed_tuning_logger.debug(f'First element to parse: {elements_to_parse[0]}')
 
     # Iterate through the found included elements
     for element in elements_to_parse:
         # Find all specified tags within each included element
         for sub_element in element.find_all(tags):
+            if tenant.embed_tuning:
+                current_app.embed_tuning_logger.debug(f'Found element: {sub_element.name}')
             if excluded_elements and sub_element.find_parent(excluded_elements):
                 continue  # Skip this sub_element if it's within any of the excluded_elements
-            sub_content = html.unescape(sub_element.get_text(strip=False))
             extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
 
     title = soup.find('title').get_text(strip=True)
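The extraction loop above is standard BeautifulSoup: `find_all` accepts a list of tag names, and `find_parent` walks up the tree so anything nested inside an excluded element can be skipped. A self-contained illustration with made-up HTML (not code from this repo):

```python
from bs4 import BeautifulSoup

doc = '''
<html><head><title>Demo</title></head>
<body>
  <article><h2>Kept</h2><p>Body text.</p></article>
  <footer><p>Dropped: lives inside an excluded parent.</p></footer>
</body></html>
'''

soup = BeautifulSoup(doc, 'html.parser')
extracted = ''
for element in [soup]:  # no included_elements, so parse the whole document
    for sub in element.find_all(['h2', 'p']):
        if sub.find_parent(['footer']):  # the excluded_elements check
            continue
        extracted += f'<{sub.name}>{sub.get_text(strip=True)}</{sub.name}>\n'

print(extracted)  # <h2>Kept</h2> and <p>Body text.</p>
```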
@@ -381,11 +390,20 @@ def process_youtube(tenant, model_variables, document_version):
     transcription_file_name = f'{document_version.id}.txt'
     markdown_file_name = f'{document_version.id}.md'
 
+    # Remove existing files (in case of a re-processing of the file)
+    safe_remove(os.path.join(base_path, download_file_name))
+    safe_remove(os.path.join(base_path, compressed_file_name))
+    safe_remove(os.path.join(base_path, transcription_file_name))
+    safe_remove(os.path.join(base_path, markdown_file_name))
+    sync_folder(base_path)
+
     of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
     document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
     compress_audio(base_path, download_file_name, compressed_file_name, tenant)
-    transcribe_audio(base_path, compressed_file_name, transcription_file_name, document_version.language, tenant, model_variables)
-    annotate_transcription(base_path, transcription_file_name, markdown_file_name, tenant, model_variables)
+    transcribe_audio(base_path, compressed_file_name, transcription_file_name,
+                     document_version.language, tenant, model_variables)
+    annotate_transcription(base_path, transcription_file_name, markdown_file_name,
+                           document_version.language, tenant, model_variables)
 
     potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
     actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
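`download_youtube` is called above but its body sits outside this diff. Based on the `from pytube import YouTube` import and the four values consumed at the call site, it plausibly looks something like this sketch; the stream selection and the `tenant` handling are assumptions, not the repo's code:

```python
import os

from pytube import YouTube


def download_youtube(url, file_location, file_name, tenant):
    # Hypothetical sketch of the helper; the actual implementation
    # (including the Pytube patch from the commit message) may differ.
    # `tenant` is unused here and kept only to mirror the call site.
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    stream.download(output_path=file_location, filename=file_name)
    return (os.path.join(file_location, file_name),
            yt.title, yt.description, yt.author)
```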
@@ -426,15 +444,41 @@ def download_youtube(url, file_location, file_name, tenant):
 def compress_audio(file_location, input_file, output_file, tenant):
     try:
         current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
-        result = os.popen(f'scripts/compress.sh -d {file_location} -i {input_file} -o {output_file}')
+
+        # Run the compression script
+        result = subprocess.run(
+            ['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
+            capture_output=True,
+            text=True
+        )
+
+        if result.returncode != 0:
+            raise Exception(f"Compression failed: {result.stderr}")
 
         output_file_path = os.path.join(file_location, output_file)
         count = 0
         while not os.path.exists(output_file_path) and count < 10:
             gevent.sleep(1)
             current_app.logger.debug(f'Waiting for {output_file_path} to be created... Count: {count}')
             count += 1
 
+        # Additional check for file stability
+        previous_size = -1
+        stable_count = 0
+        max_attempts = 12  # 1 minute total wait time
+
+        for _ in range(max_attempts):
+            if os.path.exists(output_file_path):
+                current_size = os.path.getsize(output_file_path)
+                if current_size == previous_size:
+                    stable_count += 1
+                    if stable_count >= 3:  # File size hasn't changed for 3 checks
+                        break
+                else:
+                    stable_count = 0
+                previous_size = current_size
+            gevent.sleep(5)
+
+        if stable_count < 3:
+            raise Exception("File size did not stabilize within the expected time")
+
         current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
-        return result
+        return output_file_path
     except Exception as e:
         current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
         raise
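Two distinct safeguards are at work in this hunk: `subprocess.run` blocks until `compress.sh` exits (unlike `os.popen`, which returns a pipe immediately without waiting for the command), and the polling loop then confirms the output file's size has stopped changing before the file is consumed. Factored into a standalone helper, the stability check amounts to the following (an illustrative rewrite, not code from the diff):

```python
import os
import time


def wait_for_stable_file(path, checks=3, interval=5, max_attempts=12):
    # Poll the file size; report success once it has been unchanged for
    # `checks` consecutive polls. The task above uses gevent.sleep rather
    # than time.sleep so it does not block the worker's event loop.
    previous_size = -1
    stable = 0
    for _ in range(max_attempts):
        if os.path.exists(path):
            size = os.path.getsize(path)
            if size == previous_size:
                stable += 1
                if stable >= checks:
                    return True
            else:
                stable = 0
            previous_size = size
        time.sleep(interval)
    return False
```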
@@ -448,22 +492,47 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
         input_file_path = os.path.join(file_location, input_file)
         output_file_path = os.path.join(file_location, output_file)
 
+        # Wait for the input file to exist
         count = 0
         while not os.path.exists(input_file_path) and count < 10:
             gevent.sleep(1)
             current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
             count += 1
 
-        with open(input_file_path, 'rb') as audio_file:
-            transcription = client.audio.transcriptions.create(
-                file=audio_file,
-                model=model,
-                language=language,
-                response_format='verbose_json',
-            )
+        if not os.path.exists(input_file_path):
+            raise FileNotFoundError(f"Input file {input_file_path} not found after waiting.")
 
-        with open(output_file_path, 'w') as transcript_file:
-            transcript_file.write(transcription.text)
+        # Load the audio file
+        audio = AudioSegment.from_file(input_file_path)
+
+        # Define segment length (e.g., 10 minutes)
+        segment_length = 10 * 60 * 1000  # 10 minutes in milliseconds
+
+        transcriptions = []
+
+        # Split audio into segments and transcribe each
+        for i, chunk in enumerate(audio[::segment_length]):
+            current_app.logger.debug(f'Transcribing chunk {i} of {len(audio) // segment_length}')
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
+                chunk.export(temp_audio.name, format="mp3")
+                with open(temp_audio.name, 'rb') as audio_segment:
+                    transcription = client.audio.transcriptions.create(
+                        file=audio_segment,
+                        model=model,
+                        language=language,
+                        response_format='verbose_json',
+                    )
+
+                transcriptions.append(transcription.text)
+
+            os.unlink(temp_audio.name)  # Delete the temporary file
+
+        # Combine all transcriptions
+        full_transcription = " ".join(transcriptions)
+
+        # Write the full transcription to the output file
+        with open(output_file_path, 'w') as f:
+            f.write(full_transcription)
 
         current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
     except Exception as e:
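The `audio[::segment_length]` idiom is pydub's slice syntax: indices are milliseconds, and a bare step yields consecutive, non-overlapping chunks of that length (the last one shorter), which keeps each upload under the transcription API's file-size limit. A quick demonstration with synthetic audio, no API involved:

```python
from pydub.generators import Sine

# 25 seconds of a 440 Hz tone as stand-in audio
audio = Sine(440).to_audio_segment(duration=25_000)

segment_length = 10_000  # milliseconds
for i, chunk in enumerate(audio[::segment_length]):
    print(i, len(chunk))  # 0 10000, 1 10000, 2 5000 (the remainder)
```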
@@ -472,23 +541,67 @@ def transcribe_audio(file_location, input_file, output_file, language, tenant, m
         raise
 
 
-def annotate_transcription(file_location, input_file, output_file, tenant, model_variables):
+def annotate_transcription(file_location, input_file, output_file, language, tenant, model_variables):
     try:
         current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
-        llm = model_variables['llm']
 
+        char_splitter = CharacterTextSplitter(separator='.',
+                                              chunk_size=model_variables['annotation_chunk_length'],
+                                              chunk_overlap=0)
+
+        headers_to_split_on = [
+            ("#", "Header 1"),
+            ("##", "Header 2"),
+            # ("###", "Header 3"),
+        ]
+        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
+
+        llm = model_variables['llm']
         template = model_variables['transcript_template']
-        transcript_prompt = ChatPromptTemplate.from_template(template)
+        language_template = create_language_template(template, language)
+        transcript_prompt = ChatPromptTemplate.from_template(language_template)
         setup = RunnablePassthrough()
         output_parser = StrOutputParser()
         transcript = ''
         with open(os.path.join(file_location, input_file), 'r') as f:
             transcript = f.read()
 
         chain = setup | transcript_prompt | llm | output_parser
-        input_transcript = {"transcript": transcript}
 
-        annotated_transcript = chain.invoke(input_transcript)
+        chunks = char_splitter.split_text(transcript)
+        all_markdown_chunks = []
+        last_markdown_chunk = ''
+        for chunk in chunks:
+            current_app.logger.debug(f'Annotating next chunk of {len(chunks)} for tenant {tenant.id}')
+            full_input = last_markdown_chunk + '\n' + chunk
+            if tenant.embed_tuning:
+                current_app.embed_tuning_logger.debug(f'Annotating chunk: \n'
+                                                      f'------------------\n'
+                                                      f'{full_input}\n'
+                                                      f'------------------\n')
+            input_transcript = {'transcript': full_input}
+            markdown = chain.invoke(input_transcript)
+            # GPT-4o returns some kind of content description: ```markdown <text> ```
+            if markdown.startswith("```markdown"):
+                markdown = "\n".join(markdown.strip().split("\n")[1:-1])
+            if tenant.embed_tuning:
+                current_app.embed_tuning_logger.debug(f'Markdown Received: \n'
+                                                      f'------------------\n'
+                                                      f'{markdown}\n'
+                                                      f'------------------\n')
+            md_header_splits = markdown_splitter.split_text(markdown)
+            markdown_chunks = [doc.page_content for doc in md_header_splits]
+            # claude-3.5-sonnet returns introductory text
+            if not markdown_chunks[0].startswith('#'):
+                markdown_chunks.pop(0)
+            last_markdown_chunk = markdown_chunks[-1]
+            last_markdown_chunk = "\n".join(markdown.strip().split("\n")[1:])
+            markdown_chunks.pop()
+            all_markdown_chunks += markdown_chunks
+
+        all_markdown_chunks += [last_markdown_chunk]
+
+        annotated_transcript = '\n'.join(all_markdown_chunks)
 
        with open(os.path.join(file_location, output_file), 'w') as f:
             f.write(annotated_transcript)
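The key idea in the rewritten `annotate_transcription` is the hand-off of `last_markdown_chunk`: the final header section of each annotated chunk may have been cut mid-thought, so it is withheld from the output and prepended to the next raw chunk, letting the model finish it, and only confirmed-complete sections are committed per iteration. Reduced to its skeleton, with a stub standing in for the prompt | llm | parser chain and the import path assumed (the repo's own imports are not shown in this hunk):

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

splitter = MarkdownHeaderTextSplitter(
    [("#", "Header 1"), ("##", "Header 2")], strip_headers=False
)


def annotate_in_chunks(chunks, annotate):
    # `annotate` stands in for chain.invoke; assumes it returns
    # non-empty markdown. Everything else mirrors the loop above.
    finished, carry = [], ''
    for chunk in chunks:
        markdown = annotate(carry + '\n' + chunk)
        sections = [d.page_content for d in splitter.split_text(markdown)]
        carry = sections.pop()   # last section may still be incomplete
        finished += sections     # earlier sections are final
    finished.append(carry)       # flush the remainder after the loop
    return '\n'.join(finished)
```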