From 8e1dac0233e18b577d6425e51c43305bb1ab5e9d Mon Sep 17 00:00:00 2001 From: Josako Date: Thu, 4 Jul 2024 08:11:31 +0200 Subject: [PATCH] Youtube added - further checking required --- .DS_Store | Bin 12292 -> 12292 bytes .idea/eveAI.iml | 2 +- .idea/misc.xml | 2 +- common/utils/model_utils.py | 9 + config/config.py | 34 +++- docker/.DS_Store | Bin 10244 -> 10244 bytes docker/db/.DS_Store | Bin 6148 -> 6148 bytes docker/eveai_workers/Dockerfile | 3 + eveai_app/templates/document/add_url.html | 2 +- eveai_app/templates/document/add_youtube.html | 24 +++ eveai_app/templates/navbar.html | 1 + eveai_app/views/document_forms.py | 17 +- eveai_app/views/document_views.py | 58 +++++- eveai_workers/tasks.py | 186 +++++++++++++++++- nginx/.DS_Store | Bin 6148 -> 6148 bytes requirements.txt | 2 + scripts/compress.sh | 57 ++++++ 17 files changed, 386 insertions(+), 11 deletions(-) create mode 100644 eveai_app/templates/document/add_youtube.html create mode 100755 scripts/compress.sh diff --git a/.DS_Store b/.DS_Store index c678337017e65c8dc2c2c4e165e013466c307504..2bfdc62f9032fdd6f2a130c0c957fb03ef727c13 100644 GIT binary patch delta 700 zcmZokXi3=6C&;8;GxFsPas=h(mWWTDD6EEH zN^QO_e2kIl;HSwKMHMFR7v*N%{Px-^smY9D3X|W9^1^v~j2|`|i2Y(>ip-g;BPqoe z9U>1h%STcM#>|tHV?40ASF)K=&5N6Xfq|W&m?4>=h#`}qfT4t;I5*$LB`GIA2`Iy{ ze5F#dsMKULX$3YYGe%m8(P87o9+u7A3c`#^%!ZGr0~N9`=rN=N4Jbj@Tp(MXB`r1C zK~ahg%1ls{W=b@eT&p;_UL^&njTNXZlOcs6l_3#X<8DKTzpJH=Iijmbfx8R4ih>No z;N<+=0-!!1m;fU9fi@H{R092#!I008$Dqql2DGh+VROHd3nP$LlYBA9R)K}^T~|Dj+5EcCB0ZJ z&0Yd6Vgq_39~g|;AmfoeQ*w&Yy+R!988kHr??BW{&QzCUW>rbp+@=13b&$Dfa+i7) E0BAbBy#N3J delta 688 zcmZokXi3=6C&(lvHhF?z1=9ltASJe0LCBSnX-mlD1R;sZ^};-C?EBMJ@rzBKD6EEH zN^QO_e2kH4h1=wdq6(Aui*mEtNliO1G?`IMVe)%XUN}#W@xo>Uv0qF~K2s;_NJ_CC zzLO3z%STcM#>|tHW8AR0SF)K=&GiWb0|PrlF+(y#5kn?J0YeExac;hgOHxjL5>SRC zCueTGuh?WWX$3YDk=Y<~Vx*NA4K`lvVcE>BAk3)5bbHntph6Y~J%)6k0VT+qV;7b@ zFch2YpeV%#WhN*}GcgxTu2q~|Z>|f}#tPJy$&kX3%8-bxk-@d}^J=kUj_4{<;O;sm z8mwi2p`sweFgQ6sw*aUQ2o`_{exMBn43$8CWiaG3 - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 8b28aad..e8182a0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/common/utils/model_utils.py b/common/utils/model_utils.py index cdcb459..fcccc98 100644 --- a/common/utils/model_utils.py +++ b/common/utils/model_utils.py @@ -6,6 +6,7 @@ from langchain_core.pydantic_v1 import BaseModel, Field from langchain.prompts import ChatPromptTemplate import ast from typing import List +from openai import OpenAI from common.models.document import EmbeddingSmallOpenAI @@ -117,12 +118,14 @@ def select_model_variables(tenant): rag_template = current_app.config.get('GPT4_RAG_TEMPLATE') history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE') encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE') + transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE') tool_calling_supported = True case 'gpt-3-5-turbo': summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE') rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE') history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE') encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE') + transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE') case _: raise Exception(f'Error setting model variables for tenant {tenant.id} ' f'error: Invalid chat model') @@ -130,12 +133,18 @@ def select_model_variables(tenant): model_variables['rag_template'] = rag_template model_variables['history_template'] = history_template model_variables['encyclopedia_template'] = encyclopedia_template + model_variables['transcript_template'] = transcript_template if tool_calling_supported: model_variables['cited_answer_cls'] = CitedAnswer case _: raise Exception(f'Error setting model variables for tenant {tenant.id} ' f'error: Invalid chat provider') + # Transcription Client Variables. Only Whisper-1 of OpenAI is currently supported + api_key = current_app.config.get('OPENAI_API_KEY') + model_variables['transcription_client'] = OpenAI(api_key=api_key) + model_variables['transcription_model'] = 'whisper-1' + return model_variables diff --git a/config/config.py b/config/config.py index b7462e3..90da352 100644 --- a/config/config.py +++ b/config/config.py @@ -58,7 +58,7 @@ class Config(object): SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es'] # supported LLMs - SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'mistral.mistral-embed'] + SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed'] SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402'] # Celery settings @@ -123,6 +123,32 @@ class Config(object): Question: {question}""" + GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of + 'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question. + If not, say you do not have sufficient information to answer the question. Use the {language} in your communication. + Question: + {question}""" + + GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts + and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes. + Do the following: + - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part. + - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript. + - improve errors in the transcript given the context, but leave the text intact. + + ```{transcript}``` + """ + + GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts + and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes. + Do the following: + - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part. + - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript. + - improve errors in the transcript given the context, but leave the text intact. + + ```{transcript}``` + """ + # SocketIO settings # SOCKETIO_ASYNC_MODE = 'threading' SOCKETIO_ASYNC_MODE = 'gevent' @@ -182,6 +208,9 @@ class DevConfig(Config): # OpenAI API Keys OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7' + # Groq API Keys + GROQ_API_KEY = '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' + # Unstructured settings UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw' UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io' @@ -209,6 +238,9 @@ class DevConfig(Config): # Session settings SESSION_REDIS = redis.from_url('redis://redis:6379/2') + # PATH settings + ffmpeg_path = '/usr/bin/ffmpeg' + class ProdConfig(Config): DEVELOPMENT = False diff --git a/docker/.DS_Store b/docker/.DS_Store index b268a9d2041bb7764ff27642ae0210ac93f15bdc..ed9d9e0307617c5fbd47c885551cc1af6f93d925 100644 GIT binary patch delta 523 zcmZn(XbG6$pKU^hRb{$w73>605IN;wS;4UKgaOw5fZD+tO?mX=h}XJSZUNXkh! z3{K9^Enolw7S7mOAO#lB&3AE0%E?ax@;ORRF}hcXPo5wsCJ$jAbA%}?$Utake(bRM zsGuxMJ+he$J%`vpMl<9v$LlYBA9R)Mf$?}47K(Bi-FWSFxvZ92H z2(oLuPh~AW33Lrq0^tl~34Rp&mlRA^kdWOhAoZPfV#BM=>5RK3FA*r@ma48cGBMCmFflWqtSBhU*fZH)P~Nzg zfq{XEA%!6+C*3eOIX|}m$Y)@BXVCx?&&_vnNy^Dj0`fV|TuoZVFE)9ipqM;_dCU=} ztRMqP`{DeeLpnn-vXu-%CmDfe z!R3&xZI4b_#XngA@+R)tGOh>`c$iT9; zmXkwNS>HM+K07BjFTWS)J|JLZgwPDUP#Q+{O}@Y)G5G)s4>Jp=>E!b)svL85)F!T$ zI_3zJkP=F9f%*|ySYomRD-R3vqYn%q%{*nnMR_^-dFdc)CI_%u44X6N|J4*>OBp#J~> delta 181 zcmZoMXfc@J&&azmU^g=(?_?g9EtBW7@-Q>K6P>({Rh47mPG1v0v15*t*Rx6qrMM*J zvOFdz#{OqOTkVPRT4fnjnmn}vExadJ*letr&66$2|n5ko3N3PUDCaZb8naB_Zb z0Z5pEfvaV50lO4i=ONZ7E5s&GV3$$7#45Ch6KD@m5=mD<2C^=JjfLMCH?wp6 +
{{ form.hidden_tag() }} {% set disabled_fields = [] %} {% set exclude_fields = [] %} diff --git a/eveai_app/templates/document/add_youtube.html b/eveai_app/templates/document/add_youtube.html new file mode 100644 index 0000000..94d174e --- /dev/null +++ b/eveai_app/templates/document/add_youtube.html @@ -0,0 +1,24 @@ +{% extends 'base.html' %} +{% from "macros.html" import render_field %} + +{% block title %}Add Youtube Document{% endblock %} + +{% block content_title %}Add Youtube Document{% endblock %} +{% block content_description %}Add a youtube url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %} + +{% block content %} + + {{ form.hidden_tag() }} + {% set disabled_fields = [] %} + {% set exclude_fields = [] %} + {% for field in form %} + {{ render_field(field, disabled_fields, exclude_fields) }} + {% endfor %} + +
+{% endblock %} + + +{% block content_footer %} + +{% endblock %} \ No newline at end of file diff --git a/eveai_app/templates/navbar.html b/eveai_app/templates/navbar.html index 9dd4c7c..ffe6887 100644 --- a/eveai_app/templates/navbar.html +++ b/eveai_app/templates/navbar.html @@ -83,6 +83,7 @@ {{ dropdown('Document Mgmt', 'contacts', [ {'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']}, + {'name': 'Add Youtube Document' , 'url': '/document/add_youtube', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']}, {'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']}, ]) }} diff --git a/eveai_app/views/document_forms.py b/eveai_app/views/document_forms.py index eb52e7a..30de07a 100644 --- a/eveai_app/views/document_forms.py +++ b/eveai_app/views/document_forms.py @@ -20,7 +20,6 @@ class AddDocumentForm(FlaskForm): super().__init__() self.language.choices = [(language, language) for language in session.get('tenant').get('allowed_languages')] - self.language.data = session.get('default_language') class AddURLForm(FlaskForm): @@ -36,7 +35,21 @@ class AddURLForm(FlaskForm): super().__init__() self.language.choices = [(language, language) for language in session.get('tenant').get('allowed_languages')] - self.language.data = session.get('default_language') + + +class AddYoutubeForm(FlaskForm): + url = URLField('Youtube URL', validators=[DataRequired(), URL()]) + name = StringField('Name', validators=[Length(max=100)]) + language = SelectField('Language', choices=[], validators=[Optional()]) + user_context = TextAreaField('User Context', validators=[Optional()]) + valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()]) + + submit = SubmitField('Submit') + + def __init__(self): + super().__init__() + self.language.choices = [(language, language) for language in + session.get('tenant').get('allowed_languages')] class EditDocumentForm(FlaskForm): diff --git a/eveai_app/views/document_views.py b/eveai_app/views/document_views.py index bd42f00..58af1e5 100644 --- a/eveai_app/views/document_views.py +++ b/eveai_app/views/document_views.py @@ -17,7 +17,7 @@ import io from common.models.document import Document, DocumentVersion from common.extensions import db -from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm +from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm from common.utils.middleware import mw_before_request from common.utils.celery_utils import current_celery from common.utils.nginx_utils import prefixed_url_for @@ -88,7 +88,7 @@ def add_url(): # If the form is submitted if form.validate_on_submit(): - current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}') + current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}') url = form.url.data doc_vers = DocumentVersion.query.filter_by(url=url).all() @@ -129,6 +129,50 @@ def add_url(): return render_template('document/add_url.html', form=form) +@document_bp.route('/add_youtube', methods=['GET', 'POST']) +@roles_accepted('Super User', 'Tenant Admin') +def add_youtube(): + form = AddYoutubeForm() + + if form.validate_on_submit(): + current_app.logger.info(f'Adding Youtube document for tenant {session["tenant"]["id"]}') + url = form.url.data + current_app.logger.debug(f'Value of language field: {form.language.data}') + + doc_vers = DocumentVersion.query.filter_by(url=url).all() + if doc_vers: + current_app.logger.info(f'A document with url {url} already exists. No new document created.') + flash(f'A document with url {url} already exists. No new document created.', 'info') + return redirect(prefixed_url_for('document_bp.documents')) + # As downloading a Youtube document can take quite some time, we offload this downloading to the worker + # We just pass a simple file to get things conform + file = "Youtube placeholder file" + + filename = 'placeholder.youtube' + extension = 'youtube' + form_dict = form_to_dict(form) + current_app.logger.debug(f'Form data: {form_dict}') + + new_doc, new_doc_vers = create_document_stack(form_dict, file, filename, extension) + + task = current_celery.send_task('create_embeddings', queue='embeddings', args=[ + session['tenant']['id'], + new_doc_vers.id, + ]) + current_app.logger.info(f'Processing and Embedding on Youtube document started for tenant ' + f'{session["tenant"]["id"]}, ' + f'Document Version {new_doc_vers.id}. ' + f'Processing and Embedding Youtube task: {task.id}') + flash(f'Processing on Youtube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.', + 'success') + + return redirect(prefixed_url_for('document_bp.documents')) + else: + form_validation_failed(request, form) + + return render_template('document/add_youtube.html', form=form) + + @document_bp.route('/documents', methods=['GET', 'POST']) @roles_accepted('Super User', 'Tenant Admin') def documents(): @@ -381,7 +425,11 @@ def create_document_stack(form, file, filename, extension): new_doc = create_document(form, filename) # Create the DocumentVersion - new_doc_vers = create_version_for_document(new_doc, form.get('url', ''), form['language'], form['user_context']) + new_doc_vers = create_version_for_document(new_doc, + form.get('url', ''), + form.get('language', 'en'), + form.get('user_context', '') + ) try: db.session.add(new_doc) @@ -462,6 +510,10 @@ def upload_file_for_version(doc_vers, file, extension): # Example: write content to a file manually with open(os.path.join(upload_path, doc_vers.file_name), 'wb') as f: f.write(file.getvalue()) + elif isinstance(file, str): + # It's a string, handle accordingly + with open(os.path.join(upload_path, doc_vers.file_name), 'w') as f: + f.write(file) else: raise TypeError('Unsupported file type.') diff --git a/eveai_workers/tasks.py b/eveai_workers/tasks.py index 8905a34..d3c90d7 100644 --- a/eveai_workers/tasks.py +++ b/eveai_workers/tasks.py @@ -1,19 +1,24 @@ import os from datetime import datetime as dt, timezone as tz + +import gevent from bs4 import BeautifulSoup import html from celery import states from flask import current_app # OpenAI imports from langchain.chains.summarize import load_summarize_chain -from langchain.text_splitter import CharacterTextSplitter +from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter from langchain_core.exceptions import LangChainException +from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate +from langchain_core.runnables import RunnablePassthrough from sqlalchemy.exc import SQLAlchemyError # Unstructured commercial client imports from unstructured_client import UnstructuredClient from unstructured_client.models import shared from unstructured_client.models.errors import SDKError +from pytube import YouTube from common.extensions import db from common.models.document import DocumentVersion, Embedding @@ -80,6 +85,8 @@ def create_embeddings(tenant_id, document_version_id): process_pdf(tenant, model_variables, document_version) case 'html': process_html(tenant, model_variables, document_version) + case 'youtube': + process_youtube(tenant, model_variables, document_version) case _: raise Exception(f'No functionality defined for file type {document_version.file_type} ' f'for tenant {tenant_id} ' @@ -200,7 +207,7 @@ def process_html(tenant, model_variables, document_version): if len(chunks) > 1: summary = summarize_chunk(tenant, model_variables, document_version, chunks[0]) document_version.system_context = (f'Title: {title}\n' - f'Summary: {summary}\n') + f'Summary: {summary}\n') else: document_version.system_context = (f'Title: {title}\n') @@ -408,3 +415,178 @@ def combine_chunks(potential_chunks, min_chars, max_chars): actual_chunks.append(current_chunk) return actual_chunks + + +def process_youtube(tenant, model_variables, document_version): + base_path = os.path.join(current_app.config['UPLOAD_FOLDER'], + document_version.file_location) + # clean old files if necessary + + of, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant) + document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}' + compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant) + transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables) + annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables) + + potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant) + actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'], + model_variables['max_chunk_size']) + enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks) + embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks) + + try: + db.session.add(document_version) + document_version.processing_finished_at = dt.now(tz.utc) + document_version.processing = False + db.session.add_all(embeddings) + db.session.commit() + except SQLAlchemyError as e: + current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} ' + f'on Youtube document version {document_version.id}' + f'error: {e}') + raise + + current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} ' + f'on Youtube document version {document_version.id} :-)') + + +def download_youtube(url, file_location, file_name, tenant): + try: + current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}') + yt = YouTube(url) + stream = yt.streams.get_audio_only() + output_file = stream.download(output_path=file_location, filename=file_name) + current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}') + return output_file, yt.title, yt.description, yt.author + except Exception as e: + current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for ' + f'tenant: {tenant.id} with error: {e}') + raise + + +def compress_audio(file_location, input_file, output_file, tenant): + try: + current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}') + result = os.popen(f'scripts/compress.sh -d {file_location} -i {input_file} -o {output_file}') + output_file_path = os.path.join(file_location, output_file) + count = 0 + while not os.path.exists(output_file_path) and count < 10: + gevent.sleep(1) + current_app.logger.debug(f'Waiting for {output_file_path} to be created... Count: {count}') + count += 1 + current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}') + return result + except Exception as e: + current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}') + raise + + +def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables): + try: + current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}') + client = model_variables['transcription_client'] + model = model_variables['transcription_model'] + input_file_path = os.path.join(file_location, input_file) + output_file_path = os.path.join(file_location, output_file) + + count = 0 + while not os.path.exists(input_file_path) and count < 10: + gevent.sleep(1) + current_app.logger.debug(f'Waiting for {input_file_path} to exist... Count: {count}') + count += 1 + + with open(input_file_path, 'rb') as audio_file: + transcription = client.audio.transcriptions.create( + file=audio_file, + model=model, + language=language, + response_format='verbose_json', + ) + + with open(output_file_path, 'w') as transcript_file: + transcript_file.write(transcription.text) + + current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}') + except Exception as e: + current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, ' + f'with error: {e}') + raise + + +def annotate_transcription(file_location, input_file, output_file, tenant, model_variables): + try: + current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}') + llm = model_variables['llm'] + + template = model_variables['transcript_template'] + transcript_prompt = ChatPromptTemplate.from_template(template) + setup = RunnablePassthrough() + output_parser = StrOutputParser() + transcript = '' + with open(os.path.join(file_location, input_file), 'r') as f: + transcript = f.read() + + chain = setup | transcript_prompt | llm | output_parser + input_transcript = {"transcript": transcript} + + annotated_transcript = chain.invoke(input_transcript) + + with open(os.path.join(file_location, output_file), 'w') as f: + f.write(annotated_transcript) + + current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}') + except Exception as e: + current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, ' + f'with error: {e}') + raise + + +def create_potential_chunks_for_markdown(base_path, input_file, tenant): + current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}') + markdown = '' + with open(os.path.join(base_path, input_file), 'r') as f: + markdown = f.read() + + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + # ("###", "Header 3"), + ] + + markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False) + md_header_splits = markdown_splitter.split_text(markdown) + potential_chunks = [doc.page_content for doc in md_header_splits] + + return potential_chunks + + +def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars): + actual_chunks = [] + current_chunk = "" + current_length = 0 + + for chunk in potential_chunks: + chunk_length = len(chunk) + + if current_length + chunk_length > max_chars: + if current_length >= min_chars: + actual_chunks.append(current_chunk) + current_chunk = chunk + current_length = chunk_length + else: + # If the combined chunk is still less than max_chars, keep adding + current_chunk += f'\n{chunk}' + current_length += chunk_length + else: + current_chunk += f'\n{chunk}' + current_length += chunk_length + + # Handle the last chunk + if current_chunk and current_length >= 0: + actual_chunks.append(current_chunk) + + return actual_chunks + pass + + + diff --git a/nginx/.DS_Store b/nginx/.DS_Store index 96e4f843f52768587e9cf6fc94d645cd47de5735..62ddc6cd98ad4ad96a07f1b00b6208c4aba05af6 100644 GIT binary patch delta 25 gcmZoMXffE3!^R}|d2$|G8l%C+LL>Pjj0dK&2 + echo "Usage: ./compress.sh -d -i -o " + exit 1 + ;; + esac +done + +# Check if the directory is provided +if [ -z "$directory" ]; then + echo "Directory is required." + echo "Usage: ./compress.sh -d -i -o " + exit 1 +fi + +if [ -z "$input_file" ]; then + echo "Input file is required." + echo "Usage: ./compress.sh -d -i -o " + exit 1 +fi + +if [ -z "$output_file" ]; then + echo "Output file is required." + echo "Usage: ./compress.sh -d -i -o " + exit 1 +fi + +cd "$directory" || exit 1 + +# Compress the file +/usr/bin/ffmpeg -i "$input_file" -ar 16000 -ac 1 -map 0:a "$output_file" + +WAIT_TIME=5 + +# Function to check for file existence +check_file() { + if [ -f "$output_file" ]; then + echo "File $output_file is available." + return 0 + else + echo "File $output_file is not available yet. Waiting..." + return 1 + fi +} + +# Wait for the file to become available +while ! check_file; do + sleep $WAIT_TIME +done