Youtube added - further checking required
This commit is contained in:
2
.idea/eveAI.iml
generated
2
.idea/eveAI.iml
generated
@@ -8,7 +8,7 @@
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv2" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.12 (eveAI)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.12 (eveai_dev)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TemplatesService">
|
||||
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
@@ -3,5 +3,5 @@
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.12 (eveAI)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (eveAI)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (eveai_dev)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
@@ -6,6 +6,7 @@ from langchain_core.pydantic_v1 import BaseModel, Field
|
||||
from langchain.prompts import ChatPromptTemplate
|
||||
import ast
|
||||
from typing import List
|
||||
from openai import OpenAI
|
||||
|
||||
from common.models.document import EmbeddingSmallOpenAI
|
||||
|
||||
@@ -117,12 +118,14 @@ def select_model_variables(tenant):
|
||||
rag_template = current_app.config.get('GPT4_RAG_TEMPLATE')
|
||||
history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
|
||||
encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
|
||||
transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
|
||||
tool_calling_supported = True
|
||||
case 'gpt-3-5-turbo':
|
||||
summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
|
||||
rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
|
||||
history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
|
||||
encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
|
||||
transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
f'error: Invalid chat model')
|
||||
@@ -130,12 +133,18 @@ def select_model_variables(tenant):
|
||||
model_variables['rag_template'] = rag_template
|
||||
model_variables['history_template'] = history_template
|
||||
model_variables['encyclopedia_template'] = encyclopedia_template
|
||||
model_variables['transcript_template'] = transcript_template
|
||||
if tool_calling_supported:
|
||||
model_variables['cited_answer_cls'] = CitedAnswer
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
f'error: Invalid chat provider')
|
||||
|
||||
# Transcription Client Variables. Only Whisper-1 of OpenAI is currently supported
|
||||
api_key = current_app.config.get('OPENAI_API_KEY')
|
||||
model_variables['transcription_client'] = OpenAI(api_key=api_key)
|
||||
model_variables['transcription_model'] = 'whisper-1'
|
||||
|
||||
return model_variables
|
||||
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ class Config(object):
|
||||
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
|
||||
|
||||
# supported LLMs
|
||||
SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'mistral.mistral-embed']
|
||||
SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
|
||||
SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402']
|
||||
|
||||
# Celery settings
|
||||
@@ -123,6 +123,32 @@ class Config(object):
|
||||
Question:
|
||||
{question}"""
|
||||
|
||||
GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
|
||||
'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
|
||||
If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
|
||||
Question:
|
||||
{question}"""
|
||||
|
||||
GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
|
||||
and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
|
||||
Do the following:
|
||||
- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
|
||||
- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
|
||||
- improve errors in the transcript given the context, but leave the text intact.
|
||||
|
||||
```{transcript}```
|
||||
"""
|
||||
|
||||
GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
|
||||
and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
|
||||
Do the following:
|
||||
- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
|
||||
- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
|
||||
- improve errors in the transcript given the context, but leave the text intact.
|
||||
|
||||
```{transcript}```
|
||||
"""
|
||||
|
||||
# SocketIO settings
|
||||
# SOCKETIO_ASYNC_MODE = 'threading'
|
||||
SOCKETIO_ASYNC_MODE = 'gevent'
|
||||
@@ -182,6 +208,9 @@ class DevConfig(Config):
|
||||
# OpenAI API Keys
|
||||
OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
|
||||
|
||||
# Groq API Keys
|
||||
GROQ_API_KEY = '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'
|
||||
|
||||
# Unstructured settings
|
||||
UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw'
|
||||
UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io'
|
||||
@@ -209,6 +238,9 @@ class DevConfig(Config):
|
||||
# Session settings
|
||||
SESSION_REDIS = redis.from_url('redis://redis:6379/2')
|
||||
|
||||
# PATH settings
|
||||
ffmpeg_path = '/usr/bin/ffmpeg'
|
||||
|
||||
|
||||
class ProdConfig(Config):
|
||||
DEVELOPMENT = False
|
||||
|
||||
BIN
docker/.DS_Store
vendored
BIN
docker/.DS_Store
vendored
Binary file not shown.
BIN
docker/db/.DS_Store
vendored
BIN
docker/db/.DS_Store
vendored
Binary file not shown.
@@ -27,9 +27,12 @@ RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
gcc \
|
||||
postgresql-client \
|
||||
ffmpeg \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies.
|
||||
|
||||
# Download dependencies as a separate step to take advantage of Docker's caching.
|
||||
# Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
|
||||
# Leverage a bind mount to requirements.txt to avoid having to copy them into
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
{% block content_description %}Add a url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<form method="post" enctype="multipart/form-data">
|
||||
<form method="post">
|
||||
{{ form.hidden_tag() }}
|
||||
{% set disabled_fields = [] %}
|
||||
{% set exclude_fields = [] %}
|
||||
|
||||
24
eveai_app/templates/document/add_youtube.html
Normal file
24
eveai_app/templates/document/add_youtube.html
Normal file
@@ -0,0 +1,24 @@
|
||||
{% extends 'base.html' %}
|
||||
{% from "macros.html" import render_field %}
|
||||
|
||||
{% block title %}Add Youtube Document{% endblock %}
|
||||
|
||||
{% block content_title %}Add Youtube Document{% endblock %}
|
||||
{% block content_description %}Add a YouTube url to EveAI. The video's audio is downloaded, transcribed and embedded in the background, so processing may take a while after submission.{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<form method="post">
|
||||
{{ form.hidden_tag() }}
|
||||
{% set disabled_fields = [] %}
|
||||
{% set exclude_fields = [] %}
|
||||
{% for field in form %}
|
||||
{{ render_field(field, disabled_fields, exclude_fields) }}
|
||||
{% endfor %}
|
||||
<button type="submit" class="btn btn-primary">Add Youtube Document</button>
|
||||
</form>
|
||||
{% endblock %}
|
||||
|
||||
|
||||
{% block content_footer %}
|
||||
|
||||
{% endblock %}
|
||||
@@ -83,6 +83,7 @@
|
||||
{{ dropdown('Document Mgmt', 'contacts', [
|
||||
{'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
|
||||
{'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
|
||||
{'name': 'Add Youtube Document' , 'url': '/document/add_youtube', 'roles': ['Super User', 'Tenant Admin']},
|
||||
{'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
|
||||
{'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']},
|
||||
]) }}
|
||||
|
||||
@@ -20,7 +20,6 @@ class AddDocumentForm(FlaskForm):
|
||||
super().__init__()
|
||||
self.language.choices = [(language, language) for language in
|
||||
session.get('tenant').get('allowed_languages')]
|
||||
self.language.data = session.get('default_language')
|
||||
|
||||
|
||||
class AddURLForm(FlaskForm):
|
||||
@@ -36,7 +35,21 @@ class AddURLForm(FlaskForm):
|
||||
super().__init__()
|
||||
self.language.choices = [(language, language) for language in
|
||||
session.get('tenant').get('allowed_languages')]
|
||||
self.language.data = session.get('default_language')
|
||||
|
||||
|
||||
class AddYoutubeForm(FlaskForm):
    """Form for registering a YouTube URL as a document.

    Mirrors AddURLForm: the language choices are restricted to the tenant's
    allowed languages, and the default language is pre-selected (the original
    omitted the default, unlike the sibling forms).
    """
    url = URLField('Youtube URL', validators=[DataRequired(), URL()])
    name = StringField('Name', validators=[Length(max=100)])
    language = SelectField('Language', choices=[], validators=[Optional()])
    user_context = TextAreaField('User Context', validators=[Optional()])
    valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])

    submit = SubmitField('Submit')

    def __init__(self):
        super().__init__()
        # Tenant-specific language choices come from the session, not config.
        self.language.choices = [(language, language) for language in
                                 session.get('tenant').get('allowed_languages')]
        # Consistency fix: pre-select the tenant's default language, as
        # AddDocumentForm and AddURLForm do.
        self.language.data = session.get('default_language')
|
||||
|
||||
|
||||
class EditDocumentForm(FlaskForm):
|
||||
|
||||
@@ -17,7 +17,7 @@ import io
|
||||
|
||||
from common.models.document import Document, DocumentVersion
|
||||
from common.extensions import db
|
||||
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm
|
||||
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm
|
||||
from common.utils.middleware import mw_before_request
|
||||
from common.utils.celery_utils import current_celery
|
||||
from common.utils.nginx_utils import prefixed_url_for
|
||||
@@ -88,7 +88,7 @@ def add_url():
|
||||
|
||||
# If the form is submitted
|
||||
if form.validate_on_submit():
|
||||
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
|
||||
current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
|
||||
url = form.url.data
|
||||
|
||||
doc_vers = DocumentVersion.query.filter_by(url=url).all()
|
||||
@@ -129,6 +129,50 @@ def add_url():
|
||||
return render_template('document/add_url.html', form=form)
|
||||
|
||||
|
||||
@document_bp.route('/add_youtube', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_youtube():
    """Register a YouTube URL as a document and queue background processing.

    Downloading/transcribing a video is slow, so only a placeholder file is
    stored here; the real work happens in the 'create_embeddings' Celery task.
    """
    form = AddYoutubeForm()

    # GET request or failed validation: report and re-render the form.
    if not form.validate_on_submit():
        form_validation_failed(request, form)
        return render_template('document/add_youtube.html', form=form)

    current_app.logger.info(f'Adding Youtube document for tenant {session["tenant"]["id"]}')
    video_url = form.url.data
    current_app.logger.debug(f'Value of language field: {form.language.data}')

    # Reject duplicates: any existing document version with this URL wins.
    existing_versions = DocumentVersion.query.filter_by(url=video_url).all()
    if existing_versions:
        current_app.logger.info(f'A document with url {video_url} already exists. No new document created.')
        flash(f'A document with url {video_url} already exists. No new document created.', 'info')
        return redirect(prefixed_url_for('document_bp.documents'))

    # As downloading a Youtube document can take quite some time, we offload
    # this downloading to the worker. We just pass a simple placeholder file
    # to keep the document stack creation uniform.
    placeholder_content = "Youtube placeholder file"
    placeholder_name = 'placeholder.youtube'
    placeholder_ext = 'youtube'

    form_dict = form_to_dict(form)
    current_app.logger.debug(f'Form data: {form_dict}')

    new_doc, new_doc_vers = create_document_stack(form_dict, placeholder_content,
                                                  placeholder_name, placeholder_ext)

    # Hand the heavy lifting (download, transcription, embedding) to Celery.
    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
        session['tenant']['id'],
        new_doc_vers.id,
    ])
    current_app.logger.info(f'Processing and Embedding on Youtube document started for tenant '
                            f'{session["tenant"]["id"]}, '
                            f'Document Version {new_doc_vers.id}. '
                            f'Processing and Embedding Youtube task: {task.id}')
    flash(f'Processing on Youtube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
          'success')

    return redirect(prefixed_url_for('document_bp.documents'))
|
||||
|
||||
|
||||
@document_bp.route('/documents', methods=['GET', 'POST'])
|
||||
@roles_accepted('Super User', 'Tenant Admin')
|
||||
def documents():
|
||||
@@ -381,7 +425,11 @@ def create_document_stack(form, file, filename, extension):
|
||||
new_doc = create_document(form, filename)
|
||||
|
||||
# Create the DocumentVersion
|
||||
new_doc_vers = create_version_for_document(new_doc, form.get('url', ''), form['language'], form['user_context'])
|
||||
new_doc_vers = create_version_for_document(new_doc,
|
||||
form.get('url', ''),
|
||||
form.get('language', 'en'),
|
||||
form.get('user_context', '')
|
||||
)
|
||||
|
||||
try:
|
||||
db.session.add(new_doc)
|
||||
@@ -462,6 +510,10 @@ def upload_file_for_version(doc_vers, file, extension):
|
||||
# Example: write content to a file manually
|
||||
with open(os.path.join(upload_path, doc_vers.file_name), 'wb') as f:
|
||||
f.write(file.getvalue())
|
||||
elif isinstance(file, str):
|
||||
# It's a string, handle accordingly
|
||||
with open(os.path.join(upload_path, doc_vers.file_name), 'w') as f:
|
||||
f.write(file)
|
||||
else:
|
||||
raise TypeError('Unsupported file type.')
|
||||
|
||||
|
||||
@@ -1,19 +1,24 @@
|
||||
import os
|
||||
from datetime import datetime as dt, timezone as tz
|
||||
|
||||
import gevent
|
||||
from bs4 import BeautifulSoup
|
||||
import html
|
||||
from celery import states
|
||||
from flask import current_app
|
||||
# OpenAI imports
|
||||
from langchain.chains.summarize import load_summarize_chain
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
|
||||
from langchain_core.exceptions import LangChainException
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
# Unstructured commercial client imports
|
||||
from unstructured_client import UnstructuredClient
|
||||
from unstructured_client.models import shared
|
||||
from unstructured_client.models.errors import SDKError
|
||||
from pytube import YouTube
|
||||
|
||||
from common.extensions import db
|
||||
from common.models.document import DocumentVersion, Embedding
|
||||
@@ -80,6 +85,8 @@ def create_embeddings(tenant_id, document_version_id):
|
||||
process_pdf(tenant, model_variables, document_version)
|
||||
case 'html':
|
||||
process_html(tenant, model_variables, document_version)
|
||||
case 'youtube':
|
||||
process_youtube(tenant, model_variables, document_version)
|
||||
case _:
|
||||
raise Exception(f'No functionality defined for file type {document_version.file_type} '
|
||||
f'for tenant {tenant_id} '
|
||||
@@ -200,7 +207,7 @@ def process_html(tenant, model_variables, document_version):
|
||||
if len(chunks) > 1:
|
||||
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
|
||||
document_version.system_context = (f'Title: {title}\n'
|
||||
f'Summary: {summary}\n')
|
||||
f'Summary: {summary}\n')
|
||||
else:
|
||||
document_version.system_context = (f'Title: {title}\n')
|
||||
|
||||
@@ -408,3 +415,178 @@ def combine_chunks(potential_chunks, min_chars, max_chars):
|
||||
actual_chunks.append(current_chunk)
|
||||
|
||||
return actual_chunks
|
||||
|
||||
|
||||
def process_youtube(tenant, model_variables, document_version):
    """End-to-end processing of a YouTube document version.

    Pipeline: download audio -> compress -> transcribe -> annotate into
    markdown -> markdown-aware chunking -> enrich -> embed, then persist the
    embeddings and mark the version as processed.

    Raises on any stage failure; DB errors are logged and re-raised so the
    Celery task can record the failure.
    """
    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                             document_version.file_location)
    # clean old files if necessary

    # The downloaded file path is not needed further; metadata feeds the
    # system context used during retrieval.
    _downloaded, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant)
    document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
    compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant)
    transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables)
    annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables)

    # Chunk along the markdown headings produced by the annotation step.
    potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant)
    actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                                model_variables['max_chunk_size'])
    enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

    try:
        db.session.add(document_version)
        document_version.processing_finished_at = dt.now(tz.utc)
        document_version.processing = False
        db.session.add_all(embeddings)
        db.session.commit()
    except SQLAlchemyError as e:
        # Fix: the original f-string ran the version id straight into
        # 'error:' with no separator, producing e.g. '... version 42error: ...'.
        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
                                 f'on Youtube document version {document_version.id} '
                                 f'error: {e}')
        raise

    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
                            f'on Youtube document version {document_version.id} :-)')
|
||||
|
||||
|
||||
def download_youtube(url, file_location, file_name, tenant):
    """Download the audio-only stream of a YouTube video via pytube.

    Returns (saved_file_path, title, description, author). Any failure is
    logged with the tenant id and re-raised.
    """
    logger = current_app.logger
    try:
        logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
        video = YouTube(url)
        audio_stream = video.streams.get_audio_only()
        saved_path = audio_stream.download(output_path=file_location, filename=file_name)
        logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
        return saved_path, video.title, video.description, video.author
    except Exception as e:
        logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
                     f'tenant: {tenant.id} with error: {e}')
        raise
|
||||
|
||||
|
||||
def compress_audio(file_location, input_file, output_file, tenant):
    """Launch scripts/compress.sh to re-encode audio for transcription.

    Fix: the original built a shell command with os.popen and f-string
    interpolation, which is shell-injection-prone if any path contains shell
    metacharacters, and its result pipe was never read. Using an argument
    list with subprocess.Popen avoids the shell entirely while keeping the
    original non-blocking launch + cooperative polling behavior.

    Returns the Popen handle (callers currently ignore the return value).
    NOTE(review): assumes gevent monkey-patching covers subprocess/sleep in
    this worker -- confirm against the worker bootstrap.
    """
    import subprocess  # local import: module header is outside this block

    try:
        current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
        proc = subprocess.Popen(
            ['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file]
        )
        output_file_path = os.path.join(file_location, output_file)
        # Best-effort wait (max ~10s) for the compressed file to appear;
        # downstream transcribe_audio polls again before reading it.
        count = 0
        while not os.path.exists(output_file_path) and count < 10:
            gevent.sleep(1)
            current_app.logger.debug(f'Waiting for {output_file_path} to be created... Count: {count}')
            count += 1
        current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
        return proc
    except Exception as e:
        current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
        raise
|
||||
|
||||
|
||||
def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
    """Transcribe an audio file with the tenant's transcription model.

    Waits up to ~10 seconds (cooperatively, via gevent) for the input file to
    appear, since the compression step may still be writing it, then writes
    the plain-text transcript to output_file.
    """
    try:
        current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
        client = model_variables['transcription_client']
        model = model_variables['transcription_model']
        input_file_path = os.path.join(file_location, input_file)
        output_file_path = os.path.join(file_location, output_file)

        # Poll for the compressed audio; give up after 10 attempts and let
        # the open() below raise if it still is not there.
        for attempt in range(10):
            if os.path.exists(input_file_path):
                break
            gevent.sleep(1)
            current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {attempt}')

        with open(input_file_path, 'rb') as source:
            result = client.audio.transcriptions.create(
                file=source,
                model=model,
                language=language,
                response_format='verbose_json',
            )

        with open(output_file_path, 'w') as sink:
            sink.write(result.text)

        current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
    except Exception as e:
        current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
                                 f'with error: {e}')
        raise
|
||||
|
||||
|
||||
def annotate_transcription(file_location, input_file, output_file, tenant, model_variables):
    """Turn a raw transcript into annotated markdown via the tenant's LLM.

    Reads input_file, runs it through the transcript prompt template and the
    configured chat model, and writes the markdown result to output_file.
    """
    try:
        current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')

        with open(os.path.join(file_location, input_file), 'r') as src:
            raw_transcript = src.read()

        # LangChain pipeline: passthrough -> prompt -> LLM -> plain string.
        prompt = ChatPromptTemplate.from_template(model_variables['transcript_template'])
        pipeline = RunnablePassthrough() | prompt | model_variables['llm'] | StrOutputParser()

        markdown = pipeline.invoke({"transcript": raw_transcript})

        with open(os.path.join(file_location, output_file), 'w') as dst:
            dst.write(markdown)

        current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
    except Exception as e:
        current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
                                 f'with error: {e}')
        raise
|
||||
|
||||
|
||||
def create_potential_chunks_for_markdown(base_path, input_file, tenant):
    """Split an annotated markdown transcript along H1/H2 headings.

    Returns the list of section texts (headers kept in the content) to be
    merged into size-bounded chunks by combine_chunks_for_markdown.
    """
    current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')

    with open(os.path.join(base_path, input_file), 'r') as source:
        markdown_text = source.read()

    # Only two heading levels are produced by the annotation prompt.
    split_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        # ("###", "Header 3"),
    ]

    splitter = MarkdownHeaderTextSplitter(split_levels, strip_headers=False)
    sections = splitter.split_text(markdown_text)

    return [section.page_content for section in sections]
|
||||
|
||||
|
||||
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
    """Greedily merge markdown sections into chunks bounded by max_chars.

    Sections are joined with '\n'. A buffer is flushed when adding the next
    section would exceed max_chars AND the buffer already holds at least
    min_chars; otherwise the section is appended even if that pushes the
    buffer past max_chars (so a chunk is never shorter than min_chars except
    possibly the last one).

    Fixes: removed an unreachable trailing `pass` statement, and simplified
    the final-flush guard `current_chunk and current_length >= 0` -- the
    length test was vacuously true. NOTE(review): `>= 0` may have been
    intended as `>= min_chars`; behavior is preserved as-is.

    :param potential_chunks: list of section strings (from the markdown split)
    :param min_chars: minimum chunk size before a flush is allowed
    :param max_chars: soft maximum chunk size
    :return: list of combined chunk strings
    """
    actual_chunks = []
    current_chunk = ""
    current_length = 0

    for chunk in potential_chunks:
        chunk_length = len(chunk)

        if current_length + chunk_length > max_chars:
            if current_length >= min_chars:
                # Buffer is large enough to stand alone: flush it and start
                # a fresh buffer with this section.
                actual_chunks.append(current_chunk)
                current_chunk = chunk
                current_length = chunk_length
            else:
                # Buffer is still below min_chars: keep growing it even
                # though this exceeds max_chars.
                current_chunk += f'\n{chunk}'
                current_length += chunk_length
        else:
            current_chunk += f'\n{chunk}'
            current_length += chunk_length

    # Flush whatever remains in the buffer.
    if current_chunk:
        actual_chunks.append(current_chunk)

    return actual_chunks
|
||||
|
||||
|
||||
|
||||
|
||||
BIN
nginx/.DS_Store
vendored
BIN
nginx/.DS_Store
vendored
Binary file not shown.
@@ -168,3 +168,5 @@ yarl==1.9.4
|
||||
zope.event==5.0
|
||||
zope.interface==6.3
|
||||
zxcvbn==4.4.28
|
||||
|
||||
pytube~=15.0.0
|
||||
57
scripts/compress.sh
Executable file
57
scripts/compress.sh
Executable file
@@ -0,0 +1,57 @@
|
||||
#!/bin/bash
# compress.sh -- re-encode an audio/video file to 16 kHz mono audio (for
# Whisper transcription).
#
# Usage: ./compress.sh -d <audio_folder> -i <input_file> -o <output_file>
#
# Fix: the original waited for the output file in an UNBOUNDED loop, so a
# failed ffmpeg run made the script (and its caller) hang forever. ffmpeg
# runs synchronously, so we now check its exit status and only poll briefly
# as a filesystem-latency safety net.

usage() {
    echo "Usage: ./compress.sh -d <audio_folder> -i <input_file> -o <output_file>"
}

while getopts d:i:o: flag
do
    case "${flag}" in
        d) directory="${OPTARG}";;
        i) input_file="${OPTARG}";;
        o) output_file="${OPTARG}";;
        *) # Catch-all for unexpected arguments
            echo "Invalid option: -$OPTARG" >&2
            usage
            exit 1
            ;;
    esac
done

# All three arguments are mandatory.
if [ -z "$directory" ]; then
    echo "Directory is required."
    usage
    exit 1
fi

if [ -z "$input_file" ]; then
    echo "Input file is required."
    usage
    exit 1
fi

if [ -z "$output_file" ]; then
    echo "Output file is required."
    usage
    exit 1
fi

cd "$directory" || exit 1

# Compress the file: -ar 16000 = 16 kHz sample rate, -ac 1 = mono,
# -map 0:a = audio streams only.
if ! /usr/bin/ffmpeg -i "$input_file" -ar 16000 -ac 1 -map 0:a "$output_file"; then
    echo "ffmpeg failed to compress $input_file" >&2
    exit 1
fi

WAIT_TIME=5
MAX_TRIES=12

# Wait (bounded) for the file to become visible -- ffmpeg already returned
# successfully, so this only guards against slow/remote filesystems.
tries=0
until [ -f "$output_file" ]; do
    tries=$((tries + 1))
    if [ "$tries" -ge "$MAX_TRIES" ]; then
        echo "File $output_file did not appear after $((MAX_TRIES * WAIT_TIME)) seconds." >&2
        exit 1
    fi
    echo "File $output_file is not available yet. Waiting..."
    sleep $WAIT_TIME
done

echo "File $output_file is available."
|
||||
Reference in New Issue
Block a user