From 8e1dac0233e18b577d6425e51c43305bb1ab5e9d Mon Sep 17 00:00:00 2001
From: Josako <pieter.laroy@flow-it.net>
Date: Thu, 4 Jul 2024 08:11:31 +0200
Subject: [PATCH] Youtube added - further checking required

---
 .DS_Store                                     | Bin 12292 -> 12292 bytes
 .idea/eveAI.iml                               |   2 +-
 .idea/misc.xml                                |   2 +-
 common/utils/model_utils.py                   |   9 +
 config/config.py                              |  34 +++-
 docker/.DS_Store                              | Bin 10244 -> 10244 bytes
 docker/db/.DS_Store                           | Bin 6148 -> 6148 bytes
 docker/eveai_workers/Dockerfile               |   3 +
 eveai_app/templates/document/add_url.html     |   2 +-
 eveai_app/templates/document/add_youtube.html |  24 +++
 eveai_app/templates/navbar.html               |   1 +
 eveai_app/views/document_forms.py             |  17 +-
 eveai_app/views/document_views.py             |  58 +++++-
 eveai_workers/tasks.py                        | 186 +++++++++++++++++-
 nginx/.DS_Store                               | Bin 6148 -> 6148 bytes
 requirements.txt                              |   2 +
 scripts/compress.sh                           |  57 ++++++
 17 files changed, 386 insertions(+), 11 deletions(-)
 create mode 100644 eveai_app/templates/document/add_youtube.html
 create mode 100755 scripts/compress.sh
diff --git a/.DS_Store b/.DS_Store
index c678337017e65c8dc2c2c4e165e013466c307504..2bfdc62f9032fdd6f2a130c0c957fb03ef727c13 100644
GIT binary patch
delta 700
zcmZokXi3=6C&;8;G<kwx1=9xxASJe0LCBSnY1`w;2|^N+>xFsPas=h(mWWTDD6EEH
zN^QO_e2kIl;HSwKMHMFR7v*N%{Px-^smY9D3X|W9^1^v~j2|`|i2Y(>ip-g;BPqoe
z9U>1h%STcM#>|tHV?40ASF)K=&5N6Xfq|W&m?4>=h#`}qfT4t;I5*$LB`GIA2`Iy{
ze5F#dsMKULX$3YYGe%m8(P87o9+u7A3c`#^%!ZGr0~N9`=rN=N4Jbj@Tp(MXB`r1C
zK~ahg%1ls{W=b@eT&p;_UL^&njTNXZlOcs6l_3#X<8DKTzpJH=Iijmbfx8R4ih>No
z;N<+=0-!!1m;fU9fi@H{R092#!I008$Dqql2DGh+VROHd3nP<y{pPF6OBtCO#3$FN
z%CH?V*0^RRHF=(@9E^ELbvh%*<T!<B0Zz_%0qN>$LlYBA9R)K}^T~|Dj+5EcCB0ZJ
z&0Yd6Vgq_39~g|;AmfoeQ*w&Yy+R!988kHr??BW{&QzCUW>rbp+@=13b&$Dfa+i7)
E0BAbBy#N3J

delta 688
zcmZokXi3=6C&(lvHhF?z1=9ltASJe0LCBSnX-mlD1R;sZ^};-C?EBMJ@rzBKD6EEH
zN^QO_e2kH4h1=wdq6(Aui*mEtNliO1G?`IMVe)%XUN}#W@xo>Uv0qF~K2s;_NJ_CC
zzLO3z%STcM#>|tHW8AR0SF)K=&GiWb0|PrlF+(y#5kn?J0YeExac;hgOHxjL5>SRC
zCueTGuh?WWX$3YDk=Y<~Vx*NA4K`lvVcE>BAk3)5bbHntph6Y~J%)6k0VT+qV;7b@
zFch2YpeV%#WhN*}GcgxTu2q~|Z>|f}#tPJy$&kX3%8-bxk-@d}^J=kUj_4{<;O;sm
z8mwi2p`sweFgQ6sw*aUQ2o`_{exMBn43$8CWiaG3<T2<nlmTrkV%XfT<ig0L8oc?c
z@=`{o@=ud%RAty!-rN%fbnrY?IX2xaZIEvdsZO69tB}AWU0rQtU|_7HU}|ADnNiqr
zGMl=j7qg2G$O<-~2l9adm<=)**&FXt1@!zNjzm*~@CZcB<V<xrW~M&{o4eFMunrp6
JOzu*z0sti?yix!F

diff --git a/.idea/eveAI.iml b/.idea/eveAI.iml
index bb6e42e..5d83c45 100644
--- a/.idea/eveAI.iml
+++ b/.idea/eveAI.iml
@@ -8,7 +8,7 @@
       <excludeFolder url="file://$MODULE_DIR$/.venv" />
       <excludeFolder url="file://$MODULE_DIR$/.venv2" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.12 (eveAI)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.12 (eveai_dev)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="TemplatesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 8b28aad..e8182a0 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,5 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.12 (eveAI)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (eveAI)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (eveai_dev)" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
diff --git a/common/utils/model_utils.py b/common/utils/model_utils.py
index cdcb459..fcccc98 100644
--- a/common/utils/model_utils.py
+++ b/common/utils/model_utils.py
@@ -6,6 +6,7 @@ from langchain_core.pydantic_v1 import BaseModel, Field
 from langchain.prompts import ChatPromptTemplate
 import ast
 from typing import List
+from openai import OpenAI
 
 from common.models.document import EmbeddingSmallOpenAI
 
@@ -117,12 +118,14 @@ def select_model_variables(tenant):
                     rag_template = current_app.config.get('GPT4_RAG_TEMPLATE')
                     history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
                     encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
+                    transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
                     tool_calling_supported = True
                 case 'gpt-3-5-turbo':
                     summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
                     rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
                     history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
                     encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
+                    transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
                 case _:
                     raise Exception(f'Error setting model variables for tenant {tenant.id} '
                                     f'error: Invalid chat model')
@@ -130,12 +133,18 @@ def select_model_variables(tenant):
             model_variables['rag_template'] = rag_template
             model_variables['history_template'] = history_template
             model_variables['encyclopedia_template'] = encyclopedia_template
+            model_variables['transcript_template'] = transcript_template
             if tool_calling_supported:
                 model_variables['cited_answer_cls'] = CitedAnswer
         case _:
             raise Exception(f'Error setting model variables for tenant {tenant.id} '
                             f'error: Invalid chat provider')
 
+    # Transcription Client Variables. Only Whisper-1 of OpenAI is currently supported
+    api_key = current_app.config.get('OPENAI_API_KEY')
+    model_variables['transcription_client'] = OpenAI(api_key=api_key)
+    model_variables['transcription_model'] = 'whisper-1'
+
     return model_variables
 
 
diff --git a/config/config.py b/config/config.py
index b7462e3..90da352 100644
--- a/config/config.py
+++ b/config/config.py
@@ -58,7 +58,7 @@ class Config(object):
     SUPPORTED_LANGUAGES = ['en', 'fr', 'nl', 'de', 'es']
 
     # supported LLMs
-    SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'mistral.mistral-embed']
+    SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
     SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402']
 
     # Celery settings
@@ -123,6 +123,32 @@ class Config(object):
     Question:
     {question}"""
 
+    GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of 
+        'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question. 
+        If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
+        Question:
+        {question}"""
+
+    GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts 
+    and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
+    Do the following:
+    - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
+    - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
+    - improve errors in the transcript given the context, but leave the text intact.
+    
+    ```{transcript}``` 
+    """
+
+    GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts 
+        and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
+        Do the following:
+        - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
+        - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
+        - improve errors in the transcript given the context, but leave the text intact.
+
+        ```{transcript}``` 
+        """
+
     # SocketIO settings
     # SOCKETIO_ASYNC_MODE = 'threading'
     SOCKETIO_ASYNC_MODE = 'gevent'
@@ -182,6 +208,9 @@ class DevConfig(Config):
     # OpenAI API Keys
     OPENAI_API_KEY = 'sk-proj-8R0jWzwjL7PeoPyMhJTZT3BlbkFJLb6HfRB2Hr9cEVFWEhU7'
 
+    # Groq API key: read from the environment so that no credential is committed to the repository
+    GROQ_API_KEY = __import__('os').environ.get('GROQ_API_KEY')
+
     # Unstructured settings
     UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw'
     UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io'
@@ -209,6 +238,9 @@ class DevConfig(Config):
     # Session settings
     SESSION_REDIS = redis.from_url('redis://redis:6379/2')
 
+    # PATH settings
+    ffmpeg_path = '/usr/bin/ffmpeg'
+
 
 class ProdConfig(Config):
     DEVELOPMENT = False
diff --git a/docker/.DS_Store b/docker/.DS_Store
index b268a9d2041bb7764ff27642ae0210ac93f15bdc..ed9d9e0307617c5fbd47c885551cc1af6f93d925 100644
GIT binary patch
delta 523
zcmZn(XbG6$pKU^hRb{$w73>605IN;wS;4UKgaOw5fZD+tO?mX=h}XJSZUNXkh!
z3{K9^Enolw7S7mOAO#lB&3AE0%E?ax@;ORRF}hcXPo5wsCJ$jAbA%}?$Utake(bRM
zsGuxMJ+he$J%`vpMl<9v<TIo*6eF8^!La@MSuwC-Nb)Igd$GwQTgWi`1JGy&R)##F
z$(ca1!ZRm7Icc+xxEmuQ$0V_6eooGK0qN>$LlYBA9R)Mf$?}47K(Bi-FWSFxvZ92H
z2(oLuPh~AW33Lrq0^tl~34Rp&mlRA^kdWOhAoZPfV#BM=><YhFCW$^8JpKm&tlf=p

delta 474
zcmZn(XbG6$&uF+YU^hRb;bb0x>5RK3FA*r@ma48cGBMCmFflWqtSBhU*fZH)P~Nzg
zfq{XEA%!6+C*3eOIX|}m$Y)@BXVCx?&&_vnNy^Dj0`fV|TuoZVFE)9ipqM;_dCU=}
ztRMqP`{D<ij|s}MOzsw!oxD{bnbXL?z*tAY)WQsCv7;FaLk>eeLpnn-vXu-%CmDfe
z!R3&xZI4b_#Xng<OqvzKG7yuI7I^?vjZj_|T$GoSpO+34W8Ca77RiWd@8nPkHD;%;
z+LQCeWhT#(;Nhr0wlC06?3g21;1s&RF9}{o=gGO^a+}p8zq4*;SNO#;nMw5F0QqUM
Hzu0L2G8TXT

diff --git a/docker/db/.DS_Store b/docker/db/.DS_Store
index ed1ea8538dbd890459fddb3d4f7d6ef799bd6be9..6d4aeb694980faa4eb4b4d2cc21a7b7b01057509 100644
GIT binary patch
delta 572
zcmZoMXfc@J&nU1lU^g?Pz+@hlE%iy|#RW+@`AG~647-yGax#lc3=FO@GBLBTvaxfp
zb8vIS2501#2bUz4lomTB7Da=2A^G_^NicR|QdnkcdAxv#bADb)VrE`y5m-ZJN-9uE
zOn7EqN`ARheraAxF<5UfM1q5pgEL-0y4s`K(9qCWN5RD0xK>A@+R)tGOh>`c$iT9;
zmXkwNS>HM+K07BjFTWS)J|JLZgwPDUP#Q+{O}@Y)G5G)s4>Jp=>E!b)svL85)F!T$
zI_3zJkP=F9f%*|ySYomRD-R3vqYn%q%{*nnMR_^-dFdc)CI_%u44<Hx$ST8BK4tQH
xRw=gC8s2A%#V4O<m4PuovdYOcfP#^gp@<=sA%!86p%^(RCKgI>X6N|J4*>OBp#J~>

delta 181
zcmZoMXfc@J&&azmU^g=(?_?g9EtBW7@-Q>K6P>({Rh47mPG1v0v15*t*Rx6qrMM*J
z<R>vOFdz#{OqOTkVPRT4fnjnmn}vExadJ*letr&66$2|n5ko3N3PUDCaZb8naB_Zb
z0Z5pEfvaV50lO4i=ONZ7E5s&GV3$$7#45Ch6KD@m5=mD<2C^=JjfLMCH?wp6<p%(<
Ca5A6(

diff --git a/docker/eveai_workers/Dockerfile b/docker/eveai_workers/Dockerfile
index eaed8cc..45d7ffc 100644
--- a/docker/eveai_workers/Dockerfile
+++ b/docker/eveai_workers/Dockerfile
@@ -27,9 +27,12 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     gcc \
     postgresql-client \
+    ffmpeg \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# Install Python dependencies.
+
 # Download dependencies as a separate step to take advantage of Docker's caching.
 # Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
 # Leverage a bind mount to requirements.txt to avoid having to copy them into
diff --git a/eveai_app/templates/document/add_url.html b/eveai_app/templates/document/add_url.html
index b90cca3..492fa92 100644
--- a/eveai_app/templates/document/add_url.html
+++ b/eveai_app/templates/document/add_url.html
@@ -7,7 +7,7 @@
 {% block content_description %}Add a url and the corresponding document to EveAI. In some cases, url's cannot be loaded directly. Download the html and add it as a document in that case.{% endblock %}
 
 {% block content %}
-    <form method="post" enctype="multipart/form-data">
+    <form method="post">
         {{ form.hidden_tag() }}
         {%  set disabled_fields = [] %}
         {%  set exclude_fields = [] %}
diff --git a/eveai_app/templates/document/add_youtube.html b/eveai_app/templates/document/add_youtube.html
new file mode 100644
index 0000000..94d174e
--- /dev/null
+++ b/eveai_app/templates/document/add_youtube.html
@@ -0,0 +1,24 @@
+{% extends 'base.html' %}
+{% from "macros.html" import render_field %}
+
+{% block title %}Add Youtube Document{% endblock %}
+
+{% block content_title %}Add Youtube Document{% endblock %}
+{% block content_description %}Add a YouTube URL to EveAI. The audio of the video is downloaded, transcribed and added as a document in the background, which can take some time.{% endblock %}
+
+{% block content %}
+    <form method="post">
+        {{ form.hidden_tag() }}
+        {%  set disabled_fields = [] %}
+        {%  set exclude_fields = [] %}
+        {% for field in form %}
+            {{ render_field(field, disabled_fields, exclude_fields) }}
+        {% endfor %}
+        <button type="submit" class="btn btn-primary">Add Youtube Document</button>
+    </form>
+{% endblock %}
+
+
+{% block content_footer %}
+
+{% endblock %}
\ No newline at end of file
diff --git a/eveai_app/templates/navbar.html b/eveai_app/templates/navbar.html
index 9dd4c7c..ffe6887 100644
--- a/eveai_app/templates/navbar.html
+++ b/eveai_app/templates/navbar.html
@@ -83,6 +83,7 @@
                                 {{ dropdown('Document Mgmt', 'contacts', [
                                     {'name': 'Add Document', 'url': '/document/add_document', 'roles': ['Super User', 'Tenant Admin']},
                                     {'name': 'Add URL', 'url': '/document/add_url', 'roles': ['Super User', 'Tenant Admin']},
+                                    {'name': 'Add Youtube Document' , 'url': '/document/add_youtube', 'roles': ['Super User', 'Tenant Admin']},
                                     {'name': 'All Documents', 'url': '/document/documents', 'roles': ['Super User', 'Tenant Admin']},
                                     {'name': 'Library Operations', 'url': '/document/library_operations', 'roles': ['Super User', 'Tenant Admin']},
                                 ]) }}
diff --git a/eveai_app/views/document_forms.py b/eveai_app/views/document_forms.py
index eb52e7a..30de07a 100644
--- a/eveai_app/views/document_forms.py
+++ b/eveai_app/views/document_forms.py
@@ -20,7 +20,6 @@ class AddDocumentForm(FlaskForm):
         super().__init__()
         self.language.choices = [(language, language) for language in
                                  session.get('tenant').get('allowed_languages')]
-        self.language.data = session.get('default_language')
 
 
 class AddURLForm(FlaskForm):
@@ -36,7 +35,21 @@ class AddURLForm(FlaskForm):
         super().__init__()
         self.language.choices = [(language, language) for language in
                                  session.get('tenant').get('allowed_languages')]
-        self.language.data = session.get('default_language')
+
+
+class AddYoutubeForm(FlaskForm):  # Form shown by the add_youtube view to register a YouTube video as a document
+    url = URLField('Youtube URL', validators=[DataRequired(), URL()])  # the only field with DataRequired
+    name = StringField('Name', validators=[Length(max=100)])
+    language = SelectField('Language', choices=[], validators=[Optional()])  # choices are filled in __init__
+    user_context = TextAreaField('User Context', validators=[Optional()])
+    valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])
+
+    submit = SubmitField('Submit')
+
+    def __init__(self):
+        super().__init__()  # the choices set below come from the tenant stored in the session
+        self.language.choices = [(language, language) for language in
+                                 session.get('tenant').get('allowed_languages')]
 
 
 class EditDocumentForm(FlaskForm):
diff --git a/eveai_app/views/document_views.py b/eveai_app/views/document_views.py
index bd42f00..58af1e5 100644
--- a/eveai_app/views/document_views.py
+++ b/eveai_app/views/document_views.py
@@ -17,7 +17,7 @@ import io
 
 from common.models.document import Document, DocumentVersion
 from common.extensions import db
-from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm
+from .document_forms import AddDocumentForm, AddURLForm, EditDocumentForm, EditDocumentVersionForm, AddYoutubeForm
 from common.utils.middleware import mw_before_request
 from common.utils.celery_utils import current_celery
 from common.utils.nginx_utils import prefixed_url_for
@@ -88,7 +88,7 @@ def add_url():
 
     # If the form is submitted
     if form.validate_on_submit():
-        current_app.logger.info(f'Adding document for tenant {session["tenant"]["id"]}')
+        current_app.logger.info(f'Adding url for tenant {session["tenant"]["id"]}')
         url = form.url.data
 
         doc_vers = DocumentVersion.query.filter_by(url=url).all()
@@ -129,6 +129,50 @@ def add_url():
     return render_template('document/add_url.html', form=form)
 
 
+@document_bp.route('/add_youtube', methods=['GET', 'POST'])
+@roles_accepted('Super User', 'Tenant Admin')
+def add_youtube():
+    """Show the 'add Youtube document' form and, on a valid POST, queue the video for processing.
+
+    Only a placeholder file is passed on here; the request just hands the new document version to
+    the 'create_embeddings' task, because downloading a Youtube document can take quite some time.
+    """
+    form = AddYoutubeForm()
+
+    # Guard clause: anything but a valid submission simply (re-)renders the form.
+    if not form.validate_on_submit():
+        form_validation_failed(request, form)
+        return render_template('document/add_youtube.html', form=form)
+
+    tenant_id = session["tenant"]["id"]
+    current_app.logger.info(f'Adding Youtube document for tenant {tenant_id}')
+    url = form.url.data
+    current_app.logger.debug(f'Value of language field: {form.language.data}')
+
+    # A url that already belongs to an existing DocumentVersion is not processed again.
+    if DocumentVersion.query.filter_by(url=url).all():
+        duplicate_msg = f'A document with url {url} already exists. No new document created.'
+        current_app.logger.info(duplicate_msg)
+        flash(duplicate_msg, 'info')
+        return redirect(prefixed_url_for('document_bp.documents'))
+
+    form_dict = form_to_dict(form)
+    current_app.logger.debug(f'Form data: {form_dict}')
+    new_doc, new_doc_vers = create_document_stack(form_dict, "Youtube placeholder file",
+                                                  'placeholder.youtube', 'youtube')
+
+    task = current_celery.send_task('create_embeddings', queue='embeddings',
+                                    args=[tenant_id, new_doc_vers.id])
+    current_app.logger.info(f'Processing and Embedding on Youtube document started for tenant '
+                            f'{tenant_id}, '
+                            f'Document Version {new_doc_vers.id}. '
+                            f'Processing and Embedding Youtube task: {task.id}')
+    flash(f'Processing on Youtube document {new_doc.name}, version {new_doc_vers.id} started. Task ID: {task.id}.',
+          'success')
+
+    return redirect(prefixed_url_for('document_bp.documents'))
+
+
 @document_bp.route('/documents', methods=['GET', 'POST'])
 @roles_accepted('Super User', 'Tenant Admin')
 def documents():
@@ -381,7 +425,11 @@ def create_document_stack(form, file, filename, extension):
     new_doc = create_document(form, filename)
 
     # Create the DocumentVersion
-    new_doc_vers = create_version_for_document(new_doc, form.get('url', ''), form['language'], form['user_context'])
+    new_doc_vers = create_version_for_document(new_doc,
+                                               form.get('url', ''),
+                                               form.get('language', 'en'),
+                                               form.get('user_context', '')
+                                               )
 
     try:
         db.session.add(new_doc)
@@ -462,6 +510,10 @@ def upload_file_for_version(doc_vers, file, extension):
         # Example: write content to a file manually
         with open(os.path.join(upload_path, doc_vers.file_name), 'wb') as f:
             f.write(file.getvalue())
+    elif isinstance(file, str):
+        # It's a string, handle accordingly
+        with open(os.path.join(upload_path, doc_vers.file_name), 'w') as f:
+            f.write(file)
     else:
         raise TypeError('Unsupported file type.')
 
diff --git a/eveai_workers/tasks.py b/eveai_workers/tasks.py
index 8905a34..d3c90d7 100644
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -1,19 +1,24 @@
 import os
 from datetime import datetime as dt, timezone as tz
+
+import gevent
 from bs4 import BeautifulSoup
 import html
 from celery import states
 from flask import current_app
 # OpenAI imports
 from langchain.chains.summarize import load_summarize_chain
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
 from langchain_core.exceptions import LangChainException
+from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy.exc import SQLAlchemyError
 # Unstructured commercial client imports
 from unstructured_client import UnstructuredClient
 from unstructured_client.models import shared
 from unstructured_client.models.errors import SDKError
+from pytube import YouTube
 
 from common.extensions import db
 from common.models.document import DocumentVersion, Embedding
@@ -80,6 +85,8 @@ def create_embeddings(tenant_id, document_version_id):
                 process_pdf(tenant, model_variables, document_version)
             case 'html':
                 process_html(tenant, model_variables, document_version)
+            case 'youtube':
+                process_youtube(tenant, model_variables, document_version)
             case _:
                 raise Exception(f'No functionality defined for file type {document_version.file_type} '
                                 f'for tenant {tenant_id} '
@@ -200,7 +207,7 @@ def process_html(tenant, model_variables, document_version):
     if len(chunks) > 1:
         summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
         document_version.system_context = (f'Title: {title}\n'
-                                   f'Summary: {summary}\n')
+                                           f'Summary: {summary}\n')
     else:
         document_version.system_context = (f'Title: {title}\n')
 
@@ -408,3 +415,178 @@ def combine_chunks(potential_chunks, min_chars, max_chars):
         actual_chunks.append(current_chunk)
 
     return actual_chunks
+
+
+def process_youtube(tenant, model_variables, document_version):
+    """Download, transcribe, annotate, chunk and embed the video at document_version.url, then commit."""
+    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'], document_version.file_location)
+
+    # TODO: clean old files if necessary. The returned download path is unused: later steps use file names.
+    _, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant)
+    document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
+    compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant)
+    transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables)
+    annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables)
+
+    potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant)
+    actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+                                                model_variables['max_chunk_size'])
+    enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
+    embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
+
+    try:
+        db.session.add(document_version)
+        document_version.processing_finished_at = dt.now(tz.utc)
+        document_version.processing = False
+        db.session.add_all(embeddings)
+        db.session.commit()
+    except SQLAlchemyError as e:
+        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
+                                 f'on Youtube document version {document_version.id} '
+                                 f'error: {e}')
+        raise
+
+    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
+                            f'on Youtube document version {document_version.id} :-)')
+
+
+def download_youtube(url, file_location, file_name, tenant):
+    """Download the audio-only stream of url; return (downloaded path, title, description, author)."""
+    try:
+        current_app.logger.info(f'Downloading YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
+        video = YouTube(url)
+        saved_path = video.streams.get_audio_only().download(output_path=file_location, filename=file_name)
+        current_app.logger.info(f'Downloaded YouTube video: {url} on location {file_location} for tenant: {tenant.id}')
+        return saved_path, video.title, video.description, video.author
+    except Exception as exc:
+        current_app.logger.error(f'Error downloading YouTube video: {url} on location {file_location} for '
+                                 f'tenant: {tenant.id} with error: {exc}')
+        raise
+
+
+def compress_audio(file_location, input_file, output_file, tenant):
+    """Run scripts/compress.sh on input_file and return the CompletedProcess; raise if it fails."""
+    import subprocess  # function-scope import keeps this block self-contained
+    try:
+        current_app.logger.info(f'Compressing audio on {file_location} for tenant: {tenant.id}')
+        # Argument list: no shell quoting/injection. run() waits for completion; timeout bounds a stuck script.
+        result = subprocess.run(['scripts/compress.sh', '-d', file_location, '-i', input_file, '-o', output_file],
+                                stdin=subprocess.DEVNULL, capture_output=True, text=True, check=True, timeout=3600)
+        if not os.path.exists(os.path.join(file_location, output_file)):
+            raise FileNotFoundError(f'{output_file} was not created in {file_location}')
+        current_app.logger.info(f'Compressed audio for {file_location} for tenant: {tenant.id}')
+        return result
+    except Exception as e:
+        current_app.logger.error(f'Error compressing audio on {file_location} for tenant: {tenant.id} with error: {e}')
+        raise
+
+
+def transcribe_audio(file_location, input_file, output_file, language, tenant, model_variables):
+    """Transcribe file_location/input_file with the configured transcription client into output_file."""
+    try:
+        current_app.logger.info(f'Transcribing audio on {file_location} for tenant: {tenant.id}')
+        client = model_variables['transcription_client']
+        model = model_variables['transcription_model']
+        audio_path = os.path.join(file_location, input_file)
+        transcript_path = os.path.join(file_location, output_file)
+
+        # Give the audio file up to ten one-second grace periods to show up on disk.
+        for attempt in range(10):
+            if os.path.exists(audio_path):
+                break
+            gevent.sleep(1)
+            current_app.logger.debug(f'Waiting for {audio_path} to exist... Count: {attempt}')
+
+        with open(audio_path, 'rb') as audio_file:
+            request = dict(file=audio_file, model=model,
+                           language=language, response_format='verbose_json')
+            transcription = client.audio.transcriptions.create(**request)
+
+        # Only the text attribute of the response is written out.
+        with open(transcript_path, 'w') as transcript_file:
+            transcript_file.write(transcription.text)
+
+        current_app.logger.info(f'Transcribed audio for {file_location} for tenant: {tenant.id}')
+    except Exception as exc:
+        current_app.logger.error(f'Error transcribing audio for {file_location} for tenant: {tenant.id}, '
+                                 f'with error: {exc}')
+        raise
+
+
+def annotate_transcription(file_location, input_file, output_file, tenant, model_variables):
+    """Have the LLM in model_variables rewrite the transcript in input_file and save it as output_file."""
+    try:
+        current_app.logger.debug(f'Annotating transcription on {file_location} for tenant {tenant.id}')
+        llm = model_variables['llm']
+        prompt = ChatPromptTemplate.from_template(model_variables['transcript_template'])
+        passthrough, parser = RunnablePassthrough(), StrOutputParser()
+
+        source_path = os.path.join(file_location, input_file)
+        with open(source_path, 'r') as transcript_file:
+            raw_transcript = transcript_file.read()
+
+        # Pipeline: hand the input dict through unchanged, fill the prompt template,
+        # call the LLM and reduce its answer to a plain string.
+        chain = passthrough | prompt | llm | parser
+        markdown = chain.invoke({"transcript": raw_transcript})
+
+        target_path = os.path.join(file_location, output_file)
+        with open(target_path, 'w') as markdown_file:
+            markdown_file.write(markdown)
+
+        current_app.logger.info(f'Annotated transcription for {file_location} for tenant {tenant.id}')
+    except Exception as exc:
+        current_app.logger.error(f'Error annotating transcription for {file_location} for tenant {tenant.id}, '
+                                 f'with error: {exc}')
+        raise
+
+
+def create_potential_chunks_for_markdown(base_path, input_file, tenant):
+    """Split the markdown file base_path/input_file on level-1 and level-2 headers.
+
+    Returns one string per section; header lines stay in the text (strip_headers=False).
+    """
+    current_app.logger.info(f'Creating potential chunks for {base_path} for tenant {tenant.id}')
+    with open(os.path.join(base_path, input_file), 'r') as md_file:
+        md_text = md_file.read()
+
+    # Level-3 headers are not used as split points.
+    split_levels = [("#", "Header 1"), ("##", "Header 2")]
+    splitter = MarkdownHeaderTextSplitter(split_levels, strip_headers=False)
+
+    sections = []
+    for section in splitter.split_text(md_text):
+        sections.append(section.page_content)
+    return sections
+
+
+def combine_chunks_for_markdown(potential_chunks, min_chars, max_chars):
+    """Merge consecutive markdown sections into chunks, joined by a newline.
+
+    A chunk is closed when the next section would push it over max_chars, unless it is
+    still shorter than min_chars; then the section is added anyway (max_chars is exceeded).
+    Lengths count section characters only. Returns the list of chunks ([] for no input).
+    """
+    actual_chunks = []
+    current_parts = []
+    current_length = 0
+
+    for chunk in potential_chunks:
+        chunk_length = len(chunk)
+
+        # Close the running chunk only if it is long enough and would overflow.
+        if current_parts and current_length >= min_chars and current_length + chunk_length > max_chars:
+            actual_chunks.append('\n'.join(current_parts))
+            current_parts = []
+            current_length = 0
+        current_parts.append(chunk)
+        current_length += chunk_length
+
+    # Flush the remainder; joining (not prefixing) avoids a stray leading newline.
+    if current_parts:
+        actual_chunks.append('\n'.join(current_parts))
+
+    return actual_chunks
+
+
+
diff --git a/nginx/.DS_Store b/nginx/.DS_Store
index 96e4f843f52768587e9cf6fc94d645cd47de5735..62ddc6cd98ad4ad96a07f1b00b6208c4aba05af6 100644
GIT binary patch
delta 25
gcmZoMXffE3!^R}|d2$|G8l%C+LL<h_>>Pjj0dK<yQUCw|

delta 25
gcmZoMXffE3!^XsFFgcGcjZtA^p%LR|c8<UN0AeQxa{vGU

diff --git a/requirements.txt b/requirements.txt
index ffb0294..37aaea3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -168,3 +168,5 @@ yarl==1.9.4
 zope.event==5.0
 zope.interface==6.3
 zxcvbn==4.4.28
+
+pytube~=15.0.0
\ No newline at end of file
diff --git a/scripts/compress.sh b/scripts/compress.sh
new file mode 100755
index 0000000..9e6af54
--- /dev/null
+++ b/scripts/compress.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Re-encode the audio of <input_file> (located in <audio_folder>) to <output_file>
+# at 16 kHz mono. Exits 0 once the output file is written, non-zero on any error.
+
+usage() {
+    echo "Usage: ./compress.sh -d <audio_folder> -i <input_file> -o <output_file>" >&2
+}
+
+while getopts d:i:o: flag
+do
+    case "${flag}" in
+        d) directory="${OPTARG}";;
+        i) input_file="${OPTARG}";;
+        o) output_file="${OPTARG}";;
+        *)  # getopts has already reported the offending option
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# All three arguments are mandatory
+if [ -z "$directory" ]; then
+    echo "Directory is required." >&2
+    usage
+    exit 1
+fi
+
+if [ -z "$input_file" ]; then
+    echo "Input file is required." >&2
+    usage
+    exit 1
+fi
+
+if [ -z "$output_file" ]; then
+    echo "Output file is required." >&2
+    usage
+    exit 1
+fi
+
+cd "$directory" || exit 1
+
+# Compress the file.
+#   -nostdin  never wait for keyboard input (the script runs unattended)
+#   -y        overwrite an output file left behind by an earlier run
+if ! /usr/bin/ffmpeg -nostdin -y -i "$input_file" -ar 16000 -ac 1 -map 0:a "$output_file"; then
+    echo "ffmpeg failed to compress $input_file." >&2
+    exit 1
+fi
+
+# ffmpeg ran in the foreground, so a missing or empty file is an error, not something to poll for.
+if [ ! -s "$output_file" ]; then
+    echo "File $output_file is not available." >&2
+    exit 1
+fi
+
+echo "File $output_file is available."