- Adding a Tenant Type

- Allow filtering on Tenant Types & searching for parts of Tenant names - Implement health checks - Start Prometheus monitoring (needs to be finalized) - Refine audio_processor and srt_processor to reduce duplicate code and support for larger files - Introduce repopack to reason in LLMs about the code
2024-09-13 15:43:40 +02:00
parent 9e14824249
commit 6cf660e622
41 changed files with 687 additions and 579 deletions
--- a/config/config.py
+++ b/config/config.py
@@ -139,7 +139,7 @@ class Config(object):

    SUPPORTED_FILE_TYPES = ['pdf', 'html', 'md', 'txt', 'mp3', 'mp4', 'ogg', 'srt']

-
+    TENANT_TYPES = ['Active', 'Demo', 'Inactive', 'Test']


 class DevConfig(Config):
--- a/config/gc_sa_eveai.json
+++ b/config/gc_sa_eveai.json
@@ -1,13 +0,0 @@
-{
-  "type": "service_account",
-  "project_id": "eveai-420711",
-  "private_key_id": "e666408e75793321a6134243628346722a71b3a6",
-  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQCaGTXCWpq08YD1\nOW4z+gncOlB7T/EIiEwsZgMp6pyUrNioGfiI9YN+uVR0nsUSmFf1YyerRgX7RqD5\nRc7T/OuX8iIvmloK3g7CaFezcVrjnBKcg/QsjDAt/OO3DTk4vykDlh/Kqxx73Jdv\nFH9YSV2H7ToWqIE8CTDnqe8vQS7Bq995c9fPlues31MgndRFg3CFkH0ldfZ4aGm3\n1RnBDyC+9SPQW9e7CJgNN9PWTmOT51Zyy5IRuV5OWePMQaGLVmCo5zNc/EHZEVRu\n1hxJPHL3NNmkYDY8tye8uHgjsAkv8QuwIuUSqnqjoo1/Yg+P0+9GCpePOAJRNxJS\n0YpDFWc5AgMBAAECggEACIU4/hG+bh97BD7JriFhfDDT6bg7g+pCs/hsAlxQ42jv\nOH7pyWuHJXGf5Cwx31usZAq4fcrgYnVpnyl8odIL628y9AjdI66wMuWhZnBFGJgK\nRhHcZWjW8nlXf0lBjwwFe4edzbn1AuWT5fYZ2HWDW2mthY/e8sUwqWPcWsjdifhz\nNR7V+Ia47McKXYgEKjyEObSP1NUOW24zH0DgxS52YPMwa1FoHn6+9Pr8P3TsTSO6\nh6f8tnd81DGl1UH4F5Bj/MHsQXyAMJbu44S4+rZ4Qlk+5xPp9hfCNpxWaHLIkJCg\nYXnC8UAjjyXiqyK0U0RjJf8TS1FxUI4iPepLNqp/pQKBgQDTicZnWFXmCFTnycWp\n66P3Yx0yvlKdUdfnoD/n9NdmUA3TZUlEVfb0IOm7ZFubF/zDTH87XrRiD/NVDbr8\n6bdhA1DXzraxhbfD36Hca6K74Ba4aYJsSWWwI0hL3FDSsv8c7qAIaUF2iwuHb7Y0\nRDcvZqowtQobcQC8cHLc/bI/ZwKBgQC6fMeGaU+lP6jhp9Nb/3Gz5Z1zzCu34IOo\nlgpTNZsowRKYLtjHifrEFi3XRxPKz5thMuJFniof5U4WoMYtRXy+PbgySvBpCia2\nXty05XssnLLMvLpYU5sbQvmOTe20zaIzLohRvvmqrydYIKu62NTubNeuD1L+Zr0q\nz1P5/wUgXwKBgQCW9MrRFQi3j1qHzkVwbOglsmUzwP3TpoQclw8DyIWuTZKQOMeA\nLJh+vr4NLCDzHLsT45MoGv0+vYM4PwQhV+e1I1idqLZXGMV60iv/0A/hYpjUIPch\nr38RoxwEhsRml7XWP7OUTQiaP7+Kdv3fbo6zFOB+wbLkwk90KgrOCX0aIQKBgFeK\n7esmErJjMPdFXk3om0q09nX+mWNHLOb+EDjBiGXYRM9V5oO9PQ/BzaEqh5sEXE+D\noH7H4cR5U3AB5yYnYYi41ngdf7//eO7Rl1AADhOCN9kum1eNX9mrVhU8deMTSRo3\ntNyTBwbeFF0lcRhUY5jNVW4rWW19cz3ed/B6i8CHAoGBAJ/l5rkV74Z5hg6BWNfQ\nYAg/4PLZmjnXIy5QdnWc/PYgbhn5+iVUcL9fSofFzJM1rjFnNcs3S90MGeOmfmo4\nM1WtcQFQbsCGt6+G5uEL/nf74mKUGpOqEM/XSkZ3inweWiDk3LK3iYfXCMBFouIr\n80IlzI1yMf7MVmWn3e1zPjCA\n-----END PRIVATE KEY-----\n",
-  "client_email": "eveai-349@eveai-420711.iam.gserviceaccount.com",
-  "client_id": "109927035346319712442",
-  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-  "token_uri": "https://oauth2.googleapis.com/token",
-  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/eveai-349%40eveai-420711.iam.gserviceaccount.com",
-  "universe_domain": "googleapis.com"
-}
--- a/config/prompts/openai/gpt-4o.yaml
+++ b/config/prompts/openai/gpt-4o.yaml
@@ -65,11 +65,13 @@ encyclopedia: |

 transcript: |
  You are a top administrative assistant specialized in transforming given transcriptions into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system. The transcriptions originate from podcast, videos and similar material.
+  You may receive information in different chunks. If you're not receiving the first chunk, you'll get the last part of the previous chunk, including it's title in between triple $. Consider this last part and the title as the start of the new chunk.
+

  # Best practices and steps are:
  - Respect wordings and language(s) used in the transcription. Main language is {language}.
  - Sometimes, the transcript contains speech of several people participating in a conversation. Although these are not obvious from reading the file, try to detect when other people are speaking.    
-  - Divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
+  - Divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part. Don't make logical parts too small. They should contain at least 7 or 8 sentences.
  - annotate the text to identify these logical parts using headings in {language}.
  - improve errors in the transcript given the context, but do not change the meaning and intentions of the transcription.

@@ -77,4 +79,6 @@ transcript: |

  The transcript is between triple backquotes.

+  $$${previous_part}$$$
+
  ```{transcript}```