Youtube added - further checking required
@@ -1,19 +1,24 @@
import os
from datetime import datetime as dt, timezone as tz

import gevent
from bs4 import BeautifulSoup
import html
from celery import states
from flask import current_app
# OpenAI imports
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy.exc import SQLAlchemyError
# Unstructured commercial client imports
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from pytube import YouTube

from common.extensions import db
from common.models.document import DocumentVersion, Embedding
@@ -80,6 +85,8 @@ def create_embeddings(tenant_id, document_version_id):
            process_pdf(tenant, model_variables, document_version)
        case 'html':
            process_html(tenant, model_variables, document_version)
        case 'youtube':
            process_youtube(tenant, model_variables, document_version)
        case _:
            raise Exception(f'No functionality defined for file type {document_version.file_type} '
                            f'for tenant {tenant_id} '
@@ -200,7 +207,7 @@ def process_html(tenant, model_variables, document_version):
    if len(chunks) > 1:
        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
        document_version.system_context = (f'Title: {title}\n'
                                           f'Summary: {summary}\n')
    else:
        document_version.system_context = (f'Title: {title}\n')

@@ -408,3 +415,178 @@ def combine_chunks(potential_chunks, min_chars, max_chars):
        actual_chunks.append(current_chunk)

    return actual_chunks

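# Overview (added note, not part of the commit): the function below downloads the
# audio track, compresses it, transcribes it, annotates the transcript into
# markdown, then splits on headings, combines to size, enriches and embeds the
# resulting chunks.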
def process_youtube(tenant, model_variables, document_version):
    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                             document_version.file_location)
    # clean old files if necessary

    of, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant)
    document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
    compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant)
    transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables)
    annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables)

    potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant)
    actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                                model_variables['max_chunk_size'])
    enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

    try:
        db.session.add(document_version)
        document_version.processing_finished_at = dt.now(tz.utc)
        document_version.processing = False
        db.session.add_all(embeddings)
        db.session.commit()
    except SQLAlchemyError as e:
        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
                                 f'on Youtube document version {document_version.id} '
                                 f'error: {e}')
        raise

    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
                            f'on Youtube document version {document_version.id} :-)')

def download_youtube(url, file_location, file_name, tenant):
    try:
        current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
        yt = YouTube(url)
        stream = yt.streams.get_audio_only()
        output_file = stream.download(output_path=file_location, filename=file_name)
        current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
        return output_file, yt.title, yt.description, yt.author
    except Exception as e:
        current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
                                 f'tenant: {tenant.id} with error: {e}')
        raise

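# Note (illustrative, not part of the commit): pytube's get_audio_only() picks the
# highest-bitrate audio-only stream, and stream.download() returns the path of the
# written file, which process_youtube() receives as `of` (currently unused).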
def compress_audio(file_location, input_file, output_file, tenant):
    try:
        current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
        result = os.popen(f'scripts/compress.sh -d {file_location} -i {input_file} -o {output_file}')
        output_file_path = os.path.join(file_location, output_file)
        count = 0
        while not os.path.exists(output_file_path) and count < 10:
            gevent.sleep(1)
            current_app.logger.debug(f'Waiting for {output_file_path} to be created... Count: {count}')
            count += 1
        current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
        return result
    except Exception as e:
        current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
        raise

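# Note (illustrative, not part of the commit): os.popen() returns a pipe immediately
# and does not surface a non-zero exit status, which is why compress_audio() polls
# for the output file. A blocking alternative, assuming the same compress.sh
# interface:
#
#     import subprocess
#     subprocess.run(['scripts/compress.sh', '-d', file_location,
#                     '-i', input_file, '-o', output_file],
#                    check=True)  # raises CalledProcessError on failure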
def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
    try:
        current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
        client = model_variables['transcription_client']
        model = model_variables['transcription_model']
        input_file_path = os.path.join(file_location, input_file)
        output_file_path = os.path.join(file_location, output_file)

        count = 0
        while not os.path.exists(input_file_path) and count < 10:
            gevent.sleep(1)
            current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}')
            count += 1

        with open(input_file_path, 'rb') as audio_file:
            transcription = client.audio.transcriptions.create(
                file=audio_file,
                model=model,
                language=language,
                response_format='verbose_json',
            )

        with open(output_file_path, 'w') as transcript_file:
            transcript_file.write(transcription.text)

        current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
    except Exception as e:
        current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
                                 f'with error: {e}')
        raise

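# Note (illustrative, not part of the commit): with response_format='verbose_json'
# an OpenAI-style transcription response also carries timestamped segments
# (transcription.segments) and the detected language; only transcription.text is
# persisted here.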
def annotate_transcription(file_location, input_file, output_file, tenant, model_variables):
    try:
        current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
        llm = model_variables['llm']

        template = model_variables['transcript_template']
        transcript_prompt = ChatPromptTemplate.from_template(template)
        setup = RunnablePassthrough()
        output_parser = StrOutputParser()
        transcript = ''
        with open(os.path.join(file_location, input_file), 'r') as f:
            transcript = f.read()

        chain = setup | transcript_prompt | llm | output_parser

        input_transcript = {"transcript": transcript}

        annotated_transcript = chain.invoke(input_transcript)

        with open(os.path.join(file_location, output_file), 'w') as f:
            f.write(annotated_transcript)

        current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
    except Exception as e:
        current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
                                 f'with error: {e}')
        raise

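# Note (illustrative, not part of the commit): the '|' operator composes LangChain
# runnables into a RunnableSequence, so chain.invoke({"transcript": transcript})
# feeds the dict through RunnablePassthrough into the prompt's {transcript}
# placeholder, calls the LLM, and returns plain text via StrOutputParser.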
def create_potential_chunks_for_markdown(base_path, input_file, tenant):
    current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')
    markdown = ''
    with open(os.path.join(base_path, input_file), 'r') as f:
        markdown = f.read()

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        # ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
    md_header_splits = markdown_splitter.split_text(markdown)
    potential_chunks = [doc.page_content for doc in md_header_splits]

    return potential_chunks

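# Note (illustrative, not part of the commit): split_text() returns Document objects
# whose metadata maps "Header 1"/"Header 2" to the matched heading text; only
# page_content is kept here. strip_headers=False leaves the '#'/'##' lines inside
# page_content, so every potential chunk starts with its own heading.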
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
    actual_chunks = []
    current_chunk = ""
    current_length = 0

    for chunk in potential_chunks:
        chunk_length = len(chunk)

        if current_length + chunk_length > max_chars:
            if current_length >= min_chars:
                actual_chunks.append(current_chunk)
                current_chunk = chunk
                current_length = chunk_length
            else:
                # If the combined chunk is still less than min_chars, keep adding
                current_chunk += f'\n{chunk}'
                current_length += chunk_length
        else:
            current_chunk += f'\n{chunk}'
            current_length += chunk_length

    # Handle the last chunk
    if current_chunk:
        actual_chunks.append(current_chunk)

    return actual_chunks
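# Worked example (illustrative, not part of the commit): with min_chars=200 and
# max_chars=1000, sections of 600, 500 and 150 chars combine as [600] and [500+150]:
# adding 500 to 600 would exceed max_chars and 600 >= min_chars, so the first chunk
# is flushed; the 650-char remainder is appended after the loop. A run of sections
# below min_chars keeps growing even past max_chars, so the upper bound is soft.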