Youtube added - further checking required

This commit is contained in:
Josako
2024-07-04 08:11:31 +02:00
parent 19e57f5adf
commit 8e1dac0233
17 changed files with 386 additions and 11 deletions

BIN
.DS_Store vendored

Binary file not shown.

2
.idea/eveAI.iml generated
View File

@@ -8,7 +8,7 @@
<excludeFolder url="file://$MODULE_DIR$/.venv" /> <excludeFolder url="file://$MODULE_DIR$/.venv" />
<excludeFolder url="file://$MODULE_DIR$/.venv2" /> <excludeFolder url="file://$MODULE_DIR$/.venv2" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.12 (eveAI)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.12 (eveai_dev)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
<component name="TemplatesService"> <component name="TemplatesService">

2
.idea/misc.xml generated
View File

@@ -3,5 +3,5 @@
<component name="Black"> <component name="Black">
<option name="sdkName" value="Python 3.12 (eveAI)" /> <option name="sdkName" value="Python 3.12 (eveAI)" />
</component> </component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (eveAI)" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (eveai_dev)" project-jdk-type="Python SDK" />
</project> </project>

View File

@@ -6,6 +6,7 @@ from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate from langchain.prompts import ChatPromptTemplate
import ast import ast
from typing import List from typing import List
from openai import OpenAI
from common.models.document import EmbeddingSmallOpenAI from common.models.document import EmbeddingSmallOpenAI
@@ -117,12 +118,14 @@ def select_model_variables(tenant):
rag_template = current_app.config.get('GPT4_RAG_TEMPLATE') rag_template = current_app.config.get('GPT4_RAG_TEMPLATE')
history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE') history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE') encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
tool_calling_supported = True tool_calling_supported = True
case 'gpt-3-5-turbo': case 'gpt-3-5-turbo':
summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE') summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE') rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE') history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE') encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
case _: case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} ' raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat model') f'error: Invalid chat model')
@@ -130,12 +133,18 @@ def select_model_variables(tenant):
model_variables['rag_template'] = rag_template model_variables['rag_template'] = rag_template
model_variables['history_template'] = history_template model_variables['history_template'] = history_template
model_variables['encyclopedia_template'] = encyclopedia_template model_variables['encyclopedia_template'] = encyclopedia_template
model_variables['transcript_template'] = transcript_template
if tool_calling_supported: if tool_calling_supported:
model_variables['cited_answer_cls'] = CitedAnswer model_variables['cited_answer_cls'] = CitedAnswer
case _: case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} ' raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat provider') f'error: Invalid chat provider')
# Transcription Client Variables. Only Whisper-1 of OpenAI is currently supported
api_key = current_app.config.get('OPENAI_API_KEY')
model_variables['transcription_client'] = OpenAI(api_key=api_key)
model_variables['transcription_model'] = 'whisper-1'
return model_variables return model_variables

View File

@@ -58,7 +58,7 @@ class Config(object):
SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es'] SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
# supported LLMs # supported LLMs
SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'mistral.mistral-embed'] SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402'] SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402']
# Celery settings # Celery settings
@@ -123,6 +123,32 @@ class Config(object):
Question: Question:
{question}""" {question}"""
GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
Question:
{question}"""
GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
Do the following:
- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
- improve errors in the transcript given the context, but leave the text intact.
```{transcript}```
"""
GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
Do the following:
- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
- improve errors in the transcript given the context, but leave the text intact.
```{transcript}```
"""
# SocketIO settings # SocketIO settings
# SOCKETIO_ASYNC_MODE = 'threading' # SOCKETIO_ASYNC_MODE = 'threading'
SOCKETIO_ASYNC_MODE = 'gevent' SOCKETIO_ASYNC_MODE = 'gevent'
@@ -182,6 +208,9 @@ class DevConfig(Config):
# OpenAI API Keys # OpenAI API Keys
OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7' OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
# Groq API Keys
GROQ_API_KEY = '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'
# Unstructured settings # Unstructured settings
UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw' UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw'
UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io' UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io'
@@ -209,6 +238,9 @@ class DevConfig(Config):
# Session settings # Session settings
SESSION_REDIS = redis.from_url('redis://redis:6379/2') SESSION_REDIS = redis.from_url('redis://redis:6379/2')
# PATH settings
ffmpeg_path = '/usr/bin/ffmpeg'
class ProdConfig(Config): class ProdConfig(Config):
DEVELOPMENT = False DEVELOPMENT = False

BIN
docker/.DS_Store vendored

Binary file not shown.

BIN
docker/db/.DS_Store vendored

Binary file not shown.

View File

@@ -27,9 +27,12 @@ RUN apt-get update && apt-get install -y \
build-essential \ build-essential \
gcc \ gcc \
postgresql-client \ postgresql-client \
ffmpeg \
&& apt-get clean \ && apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Install Python dependencies.
# Download dependencies as a separate step to take advantage of Docker's caching. # Download dependencies as a separate step to take advantage of Docker's caching.
# Leverage a cache mount to /root/.cache/pip to speed up subsequent builds. # Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
# Leverage a bind mount to requirements.txt to avoid having to copy them into # Leverage a bind mount to requirements.txt to avoid having to copy them into

View File

@@ -7,7 +7,7 @@
{% block content_description %}Add a url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %} {% block content_description %}Add a url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %}
{% block content %} {% block content %}
<form method="post" enctype="multipart/form-data"> <form method="post">
{{ form.hidden_tag() }} {{ form.hidden_tag() }}
{% set disabled_fields = [] %} {% set disabled_fields = [] %}
{% set exclude_fields = [] %} {% set exclude_fields = [] %}

View File

@@ -0,0 +1,24 @@
{% extends 'base.html' %}
{% from "macros.html" import render_field %}
{% block title %}Add Youtube Document{% endblock %}
{% block content_title %}Add Youtube Document{% endblock %}
{% block content_description %}Add a YouTube URL to EveAI. The video's audio is downloaded, transcribed and embedded in the background, so it can take some time before the document is fully processed.{% endblock %}
{% block content %}
<form method="post">
{{ form.hidden_tag() }}
{% set disabled_fields = [] %}
{% set exclude_fields = [] %}
{% for field in form %}
{{ render_field(field, disabled_fields, exclude_fields) }}
{% endfor %}
<button type="submit" class="btn btn-primary">Add Youtube Document</button>
</form>
{% endblock %}
{% block content_footer %}
{% endblock %}

View File

@@ -83,6 +83,7 @@
{{ dropdown('Document Mgmt', 'contacts', [ {{ dropdown('Document Mgmt', 'contacts', [
{'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Add Youtube Document' , 'url': '/document/add_youtube', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
{'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']},
]) }} ]) }}

View File

@@ -20,7 +20,6 @@ class AddDocumentForm(FlaskForm):
super().__init__() super().__init__()
self.language.choices = [(language, language) for language in self.language.choices = [(language, language) for language in
session.get('tenant').get('allowed_languages')] session.get('tenant').get('allowed_languages')]
self.language.data = session.get('default_language')
class AddURLForm(FlaskForm): class AddURLForm(FlaskForm):
@@ -36,7 +35,21 @@ class AddURLForm(FlaskForm):
super().__init__() super().__init__()
self.language.choices = [(language, language) for language in self.language.choices = [(language, language) for language in
session.get('tenant').get('allowed_languages')] session.get('tenant').get('allowed_languages')]
self.language.data = session.get('default_language')
class AddYoutubeForm(FlaskForm):
    """Form used to register a YouTube video as a new document.

    The language choices are not static: they are filled in on construction
    from the allowed languages of the tenant stored in the session.
    """
    url = URLField('Youtube URL', validators=[DataRequired(), URL()])
    name = StringField('Name', validators=[Length(max=100)])
    language = SelectField('Language', choices=[], validators=[Optional()])
    user_context = TextAreaField('User Context', validators=[Optional()])
    valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])

    submit = SubmitField('Submit')

    def __init__(self):
        super().__init__()
        # Offer each allowed language of the current tenant as (value, label).
        allowed_languages = session.get('tenant').get('allowed_languages')
        self.language.choices = [(code, code) for code in allowed_languages]
class EditDocumentForm(FlaskForm): class EditDocumentForm(FlaskForm):

View File

@@ -17,7 +17,7 @@ import io
from common.models.document import Document, DocumentVersion from common.models.document import Document, DocumentVersion
from common.extensions import db from common.extensions import db
from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm
from common.utils.middleware import mw_before_request from common.utils.middleware import mw_before_request
from common.utils.celery_utils import current_celery from common.utils.celery_utils import current_celery
from common.utils.nginx_utils import prefixed_url_for from common.utils.nginx_utils import prefixed_url_for
@@ -88,7 +88,7 @@ def add_url():
# If the form is submitted # If the form is submitted
if form.validate_on_submit(): if form.validate_on_submit():
current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}') current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
url = form.url.data url = form.url.data
doc_vers = DocumentVersion.query.filter_by(url=url).all() doc_vers = DocumentVersion.query.filter_by(url=url).all()
@@ -129,6 +129,50 @@ def add_url():
return render_template('document/add_url.html', form=form) return render_template('document/add_url.html', form=form)
@document_bp.route('/add_youtube', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin')
def add_youtube():
    """Register a YouTube URL as a document and queue it for processing.

    On a valid submission a document and document version are created from a
    placeholder file and a 'create_embeddings' task is sent to the
    'embeddings' queue; the user is then redirected to the document list.
    If a document version with the same URL already exists, nothing is
    created. On GET or failed validation the form is rendered.
    """
    form = AddYoutubeForm()

    # Guard clause: nothing to process unless the form was submitted and is valid.
    if not form.validate_on_submit():
        form_validation_failed(request, form)
        return render_template('document/add_youtube.html', form=form)

    current_app.logger.info(f'Adding Youtube document for tenant {session["tenant"]["id"]}')
    url = form.url.data
    current_app.logger.debug(f'Value of language field: {form.language.data}')

    existing_versions = DocumentVersion.query.filter_by(url=url).all()
    if existing_versions:
        duplicate_message = f'A document with url {url} already exists. No new document created.'
        current_app.logger.info(duplicate_message)
        flash(duplicate_message, 'info')
        return redirect(prefixed_url_for('document_bp.documents'))

    # As downloading a Youtube document can take quite some time, we offload this downloading to the worker
    # We just pass a simple file to get things conform
    placeholder_content = "Youtube placeholder file"
    placeholder_name = 'placeholder.youtube'
    file_type = 'youtube'
    form_dict = form_to_dict(form)
    current_app.logger.debug(f'Form data: {form_dict}')

    new_doc, new_doc_vers = create_document_stack(form_dict, placeholder_content, placeholder_name, file_type)

    task_args = [
        session['tenant']['id'],
        new_doc_vers.id,
    ]
    task = current_celery.send_task('create_embeddings', queue='embeddings', args=task_args)
    current_app.logger.info(f'Processing and Embedding on Youtube document started for tenant '
                            f'{session["tenant"]["id"]}, '
                            f'Document Version {new_doc_vers.id}. '
                            f'Processing and Embedding Youtube task: {task.id}')
    flash(f'Processing on Youtube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
          'success')

    return redirect(prefixed_url_for('document_bp.documents'))
@document_bp.route('/documents', methods=['GET', 'POST']) @document_bp.route('/documents', methods=['GET', 'POST'])
@roles_accepted('Super User', 'Tenant Admin') @roles_accepted('Super User', 'Tenant Admin')
def documents(): def documents():
@@ -381,7 +425,11 @@ def create_document_stack(form, file, filename, extension):
new_doc = create_document(form, filename) new_doc = create_document(form, filename)
# Create the DocumentVersion # Create the DocumentVersion
new_doc_vers = create_version_for_document(new_doc, form.get('url', ''), form['language'], form['user_context']) new_doc_vers = create_version_for_document(new_doc,
form.get('url', ''),
form.get('language', 'en'),
form.get('user_context', '')
)
try: try:
db.session.add(new_doc) db.session.add(new_doc)
@@ -462,6 +510,10 @@ def upload_file_for_version(doc_vers, file, extension):
# Example: write content to a file manually # Example: write content to a file manually
with open(os.path.join(upload_path, doc_vers.file_name), 'wb') as f: with open(os.path.join(upload_path, doc_vers.file_name), 'wb') as f:
f.write(file.getvalue()) f.write(file.getvalue())
elif isinstance(file, str):
# It's a string, handle accordingly
with open(os.path.join(upload_path, doc_vers.file_name), 'w') as f:
f.write(file)
else: else:
raise TypeError('Unsupported file type.') raise TypeError('Unsupported file type.')

View File

@@ -1,19 +1,24 @@
import os import os
from datetime import datetime as dt, timezone as tz from datetime import datetime as dt, timezone as tz
import gevent
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import html import html
from celery import states from celery import states
from flask import current_app from flask import current_app
# OpenAI imports # OpenAI imports
from langchain.chains.summarize import load_summarize_chain from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.exceptions import LangChainException from langchain_core.exceptions import LangChainException
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.exc import SQLAlchemyError
# Unstructured commercial client imports # Unstructured commercial client imports
from unstructured_client import UnstructuredClient from unstructured_client import UnstructuredClient
from unstructured_client.models import shared from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError from unstructured_client.models.errors import SDKError
from pytube import YouTube
from common.extensions import db from common.extensions import db
from common.models.document import DocumentVersion, Embedding from common.models.document import DocumentVersion, Embedding
@@ -80,6 +85,8 @@ def create_embeddings(tenant_id, document_version_id):
process_pdf(tenant, model_variables, document_version) process_pdf(tenant, model_variables, document_version)
case 'html': case 'html':
process_html(tenant, model_variables, document_version) process_html(tenant, model_variables, document_version)
case 'youtube':
process_youtube(tenant, model_variables, document_version)
case _: case _:
raise Exception(f'No functionality defined for file type {document_version.file_type} ' raise Exception(f'No functionality defined for file type {document_version.file_type} '
f'for tenant {tenant_id} ' f'for tenant {tenant_id} '
@@ -200,7 +207,7 @@ def process_html(tenant, model_variables, document_version):
if len(chunks) > 1: if len(chunks) > 1:
summary = summarize_chunk(tenant, model_variables, document_version, chunks[0]) summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
document_version.system_context = (f'Title: {title}\n' document_version.system_context = (f'Title: {title}\n'
f'Summary: {summary}\n') f'Summary: {summary}\n')
else: else:
document_version.system_context = (f'Title: {title}\n') document_version.system_context = (f'Title: {title}\n')
@@ -408,3 +415,178 @@ def combine_chunks(potential_chunks, min_chars, max_chars):
actual_chunks.append(current_chunk) actual_chunks.append(current_chunk)
return actual_chunks return actual_chunks
def process_youtube(tenant, model_variables, document_version):
    """Turn the YouTube video behind a document version into stored embeddings.

    Pipeline: download the audio, compress it, transcribe it, let the LLM
    annotate the transcript as markdown, split the markdown on headings,
    combine the pieces into chunks, enrich and embed them, and finally
    persist the embeddings together with the updated document version.

    All intermediate files are written below
    UPLOAD_FOLDER/<document_version.file_location>.

    :param tenant: tenant the document belongs to (``tenant.id`` is used for logging).
    :param model_variables: dict providing at least 'min_chunk_size' and
        'max_chunk_size', plus whatever the individual steps read from it.
    :param document_version: document version being processed; its
        ``system_context``, ``processing_finished_at`` and ``processing``
        attributes are updated.
    :raises SQLAlchemyError: re-raised after logging when saving fails.
    """
    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                             document_version.file_location)
    # TODO: clean old files if necessary (not implemented yet)

    # The returned file path is not needed: later steps address the file by name inside base_path.
    _, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant)
    document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
    compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant)
    transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables)
    annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables)

    potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant)
    actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                                model_variables['max_chunk_size'])
    enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)

    try:
        db.session.add(document_version)
        document_version.processing_finished_at = dt.now(tz.utc)
        document_version.processing = False
        db.session.add_all(embeddings)
        db.session.commit()
    except SQLAlchemyError as e:
        # Trailing space added after the version id so it no longer runs into 'error:'.
        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
                                 f'on Youtube document version {document_version.id} '
                                 f'error: {e}')
        raise

    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
                            f'on Youtube document version {document_version.id} :-)')
def download_youtube(url, file_location, file_name, tenant):
    """Download the audio-only stream of a YouTube video.

    :param url: URL of the YouTube video.
    :param file_location: directory the file is downloaded into.
    :param file_name: name given to the downloaded file.
    :param tenant: tenant on whose behalf the download runs (``tenant.id`` is logged).
    :return: tuple of (path of the downloaded file, video title,
        video description, video author).
    :raises Exception: any error is logged and re-raised unchanged.
    """
    logger = current_app.logger
    try:
        logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
        video = YouTube(url)
        audio_stream = video.streams.get_audio_only()
        saved_path = audio_stream.download(output_path=file_location, filename=file_name)
        logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
        return saved_path, video.title, video.description, video.author
    except Exception as e:
        current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
                                 f'tenant: {tenant.id} with error: {e}')
        raise
def compress_audio(file_location, input_file, output_file, tenant, timeout=600):
    """Compress an audio file by running scripts/compress.sh and wait until it is done.

    The previous implementation started the script with ``os.popen`` and then
    only waited (for at most 10 seconds) until the output file *existed*. The
    converter creates that file before it has finished writing it, so the next
    step could read a partial file. This version waits for the script itself
    to exit, checks its exit status and verifies the output file.

    :param file_location: directory containing the input file; the output is written there too.
    :param input_file: name of the file to compress, relative to ``file_location``.
    :param output_file: name of the compressed file to create, relative to ``file_location``.
    :param tenant: tenant on whose behalf the compression runs (``tenant.id`` is logged).
    :param timeout: maximum number of seconds to wait for the script before it is killed.
    :return: the finished ``subprocess.Popen`` object of the script.
        NOTE(review): this used to be the pipe object returned by ``os.popen``;
        the only caller visible in this file ignores the return value - confirm
        no other caller relies on it.
    :raises TimeoutError: the script did not finish within ``timeout`` seconds.
    :raises RuntimeError: the script exited with a non-zero status.
    :raises FileNotFoundError: the script succeeded but the output file is missing.
    """
    # Local import so this block does not depend on a change to the module imports.
    import subprocess

    output_file_path = os.path.join(file_location, output_file)
    try:
        current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through untouched. stdin is detached so the
        # converter can never block on an interactive prompt.
        process = subprocess.Popen(
            ['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
            stdin=subprocess.DEVNULL,
        )

        # Poll with gevent.sleep so other greenlets keep running while we wait.
        waited = 0
        while process.poll() is None:
            if waited >= timeout:
                # Only the script itself is killed here; a child it started may outlive it.
                process.kill()
                process.wait()
                raise TimeoutError(f'Compression did not finish within {timeout} seconds')
            gevent.sleep(1)
            waited += 1
            current_app.logger.debug(f'Waiting for {output_file_path} to be created... Count: {waited}')

        if process.returncode != 0:
            raise RuntimeError(f'scripts/compress.sh exited with status {process.returncode}')
        if not os.path.exists(output_file_path):
            raise FileNotFoundError(f'Compressed file {output_file_path} was not created')

        current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
        return process
    except Exception as e:
        current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
        raise
def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
    """Transcribe an audio file and store the transcript text in a file.

    The transcription client and model name are taken from
    ``model_variables['transcription_client']`` and
    ``model_variables['transcription_model']``.

    :param file_location: directory holding the input file and receiving the output file.
    :param input_file: name of the audio file to transcribe.
    :param output_file: name of the text file the transcript is written to.
    :param language: language passed on to the transcription request.
    :param tenant: tenant on whose behalf the transcription runs (``tenant.id`` is logged).
    :param model_variables: dict providing the transcription client and model.
    :raises Exception: any error is logged and re-raised unchanged.
    """
    try:
        current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
        client = model_variables['transcription_client']
        model = model_variables['transcription_model']
        audio_path = os.path.join(file_location, input_file)
        transcript_path = os.path.join(file_location, output_file)

        # Give the audio file up to ten one-second waits to show up before trying to open it.
        attempts = 0
        while not os.path.exists(audio_path) and attempts < 10:
            gevent.sleep(1)
            current_app.logger.debug(f'Waiting for {audio_path} to exist... Count: {attempts}')
            attempts += 1

        with open(audio_path, 'rb') as audio_file:
            transcription = client.audio.transcriptions.create(
                file=audio_file,
                model=model,
                language=language,
                response_format='verbose_json',
            )

        # Only the plain text of the response is kept.
        with open(transcript_path, 'w') as transcript_file:
            transcript_file.write(transcription.text)

        current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
    except Exception as e:
        current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
                                 f'with error: {e}')
        raise
def annotate_transcription(file_location, input_file, output_file, tenant, model_variables):
    """Let the LLM annotate a raw transcript and write the result to a file.

    The prompt is built from ``model_variables['transcript_template']`` and
    sent to ``model_variables['llm']``; the string returned by the chain is
    written to ``output_file``.

    :param file_location: directory holding the input file and receiving the output file.
    :param input_file: name of the file containing the raw transcript.
    :param output_file: name of the file the annotated transcript is written to.
    :param tenant: tenant on whose behalf the annotation runs (``tenant.id`` is logged).
    :param model_variables: dict providing 'llm' and 'transcript_template'.
    :raises Exception: any error is logged and re-raised unchanged.
    """
    try:
        current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
        llm = model_variables['llm']
        prompt = ChatPromptTemplate.from_template(model_variables['transcript_template'])
        chain = RunnablePassthrough() | prompt | llm | StrOutputParser()

        source_path = os.path.join(file_location, input_file)
        with open(source_path, 'r') as source:
            raw_transcript = source.read()

        annotated_transcript = chain.invoke({"transcript": raw_transcript})

        target_path = os.path.join(file_location, output_file)
        with open(target_path, 'w') as target:
            target.write(annotated_transcript)

        current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
    except Exception as e:
        current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
                                 f'with error: {e}')
        raise
def create_potential_chunks_for_markdown(base_path, input_file, tenant):
    """Split a markdown file into candidate chunks on level 1 and level 2 headings.

    :param base_path: directory containing the markdown file.
    :param input_file: name of the markdown file.
    :param tenant: tenant on whose behalf the split runs (``tenant.id`` is logged).
    :return: list with the text of every section, headings included.
    """
    current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')

    with open(os.path.join(base_path, input_file), 'r') as md_file:
        content = md_file.read()

    # Only two heading levels are split on; level 3 ("###") is deliberately left out.
    split_levels = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
    splitter = MarkdownHeaderTextSplitter(split_levels, strip_headers=False)

    return [section.page_content for section in splitter.split_text(content)]
def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
    """Merge consecutive markdown sections into chunks within a size window.

    Sections are appended to the current chunk, separated by a newline. A new
    chunk is started when adding the next section would push the current chunk
    over ``max_chars`` and the current chunk already holds at least
    ``min_chars`` characters. A chunk that is still below ``min_chars`` keeps
    growing, even past ``max_chars``. Lengths count section characters only,
    not the joining newlines.

    Compared to the previous version, chunks no longer start with a stray
    newline, and no empty chunk is emitted (empty sections are skipped).

    :param potential_chunks: iterable of section strings, in document order.
    :param min_chars: minimum size a chunk must have before it may be closed.
    :param max_chars: size above which a sufficiently large chunk is closed.
    :return: list of combined chunk strings.
    """
    actual_chunks = []
    current_parts = []
    current_length = 0

    for chunk in potential_chunks:
        if not chunk:
            # An empty section carries no content and would only add blank lines.
            continue

        chunk_length = len(chunk)
        would_overflow = current_length + chunk_length > max_chars

        if current_parts and would_overflow and current_length >= min_chars:
            # Current chunk is big enough: close it and start a new one with this section.
            actual_chunks.append('\n'.join(current_parts))
            current_parts = [chunk]
            current_length = chunk_length
        else:
            current_parts.append(chunk)
            current_length += chunk_length

    # Handle the last chunk
    if current_parts:
        actual_chunks.append('\n'.join(current_parts))

    return actual_chunks

BIN
nginx/.DS_Store vendored

Binary file not shown.

View File

@@ -168,3 +168,5 @@ yarl==1.9.4
zope.event==5.0 zope.event==5.0
zope.interface==6.3 zope.interface==6.3
zxcvbn==4.4.28 zxcvbn==4.4.28
pytube~=15.0.0

57
scripts/compress.sh Executable file
View File

@@ -0,0 +1,57 @@
#!/bin/bash
# Compress the audio of a media file to 16 kHz mono using ffmpeg.
#
# Usage: ./compress.sh -d <audio_folder> -i <input_file> -o <output_file>
#
# Exit status: 0 when the output file was created, non-zero when an argument
# is missing, the folder cannot be entered, ffmpeg fails, or the output file
# does not show up. The script never waits indefinitely.

usage() {
    echo "Usage: ./compress.sh -d <audio_folder> -i <input_file> -o <output_file>"
}

while getopts d:i:o: flag
do
    case "${flag}" in
        d) directory="${OPTARG}";;
        i) input_file="${OPTARG}";;
        o) output_file="${OPTARG}";;
        *) # Catch-all for unexpected arguments
            echo "Invalid option: -$OPTARG" >&2
            usage
            exit 1
            ;;
    esac
done

# Check if the directory is provided
if [ -z "$directory" ]; then
    echo "Directory is required."
    usage
    exit 1
fi

if [ -z "$input_file" ]; then
    echo "Input file is required."
    usage
    exit 1
fi

if [ -z "$output_file" ]; then
    echo "Output file is required."
    usage
    exit 1
fi

cd "$directory" || exit 1

# Compress the file.
# -nostdin: the script runs unattended, so ffmpeg must never wait for keyboard input.
# -y: overwrite an output file left behind by an earlier run instead of prompting.
/usr/bin/ffmpeg -nostdin -y -i "$input_file" -ar 16000 -ac 1 -map 0:a "$output_file"
ffmpeg_status=$?
if [ "$ffmpeg_status" -ne 0 ]; then
    echo "ffmpeg failed with exit status $ffmpeg_status." >&2
    exit "$ffmpeg_status"
fi

WAIT_TIME=5
MAX_ATTEMPTS=12

# Function to check for file existence
check_file() {
    if [ -f "$output_file" ]; then
        echo "File $output_file is available."
        return 0
    else
        echo "File $output_file is not available yet. Waiting..."
        return 1
    fi
}

# Wait for the file to become available, but give up after MAX_ATTEMPTS checks
# so a missing output can never block the caller forever.
attempt=0
while ! check_file; do
    attempt=$((attempt + 1))
    if [ "$attempt" -ge "$MAX_ATTEMPTS" ]; then
        echo "File $output_file was not created." >&2
        exit 1
    fi
    sleep $WAIT_TIME
done