- Improve annotation algorithm for Youtube (and others)
- Patch Pytube
- Improve OS-level deletion and writing of files
- Start working on Claude support
- Improve template management
This commit is contained in:
@@ -1,12 +1,13 @@
|
||||
import langcodes
|
||||
from flask import current_app
|
||||
from langchain_community.embeddings import OpenAIEmbeddings
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
||||
from langchain_anthropic import ChatAnthropic
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field
|
||||
from langchain.prompts import ChatPromptTemplate
|
||||
import ast
|
||||
from typing import List
|
||||
from openai import OpenAI
|
||||
# from groq import Groq
|
||||
|
||||
from common.models.document import EmbeddingSmallOpenAI, EmbeddingLargeOpenAI
|
||||
|
||||
@@ -121,31 +122,46 @@ def select_model_variables(tenant):
|
||||
tool_calling_supported = False
|
||||
match llm_model:
|
||||
case 'gpt-4-turbo' | 'gpt-4o':
|
||||
summary_template = current_app.config.get('GPT4_SUMMARY_TEMPLATE')
|
||||
rag_template = current_app.config.get('GPT4_RAG_TEMPLATE')
|
||||
history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
|
||||
encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
|
||||
transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
|
||||
html_parse_template = current_app.config.get('GPT4_HTML_PARSE_TEMPLATE')
|
||||
pdf_parse_template = current_app.config.get('GPT4_PDF_PARSE_TEMPLATE')
|
||||
tool_calling_supported = True
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
f'error: Invalid chat model')
|
||||
model_variables['summary_template'] = summary_template
|
||||
model_variables['rag_template'] = rag_template
|
||||
model_variables['history_template'] = history_template
|
||||
model_variables['encyclopedia_template'] = encyclopedia_template
|
||||
model_variables['transcript_template'] = transcript_template
|
||||
model_variables['html_parse_template'] = html_parse_template
|
||||
model_variables['pdf_parse_template'] = pdf_parse_template
|
||||
if tool_calling_supported:
|
||||
model_variables['cited_answer_cls'] = CitedAnswer
|
||||
case 'anthropic':
|
||||
api_key = current_app.config.get('ANTHROPIC_API_KEY')
|
||||
# Anthropic does not have the same 'generic' model names as OpenAI
|
||||
llm_model_ext = current_app.config.get('ANTHROPIC_LLM_VERSIONS').get(llm_model)
|
||||
model_variables['llm'] = ChatAnthropic(api_key=api_key,
|
||||
model=llm_model_ext,
|
||||
temperature=model_variables['RAG_temperature'])
|
||||
model_variables['llm_no_rag'] = ChatAnthropic(api_key=api_key,
|
||||
model=llm_model_ext,
|
||||
temperature=model_variables['RAG_temperature'])
|
||||
tool_calling_supported = True
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
f'error: Invalid chat provider')
|
||||
|
||||
# Transcription Client Variables. Only Whisper-1 of OpenAI is currently supported
|
||||
if tool_calling_supported:
|
||||
model_variables['cited_answer_cls'] = CitedAnswer
|
||||
|
||||
templates = current_app.config['PROMPT_TEMPLATES'][f'{llm_provider}.{llm_model}']
|
||||
model_variables['summary_template'] = templates['summary']
|
||||
model_variables['rag_template'] = templates['rag']
|
||||
model_variables['history_template'] = templates['history']
|
||||
model_variables['encyclopedia_template'] = templates['encyclopedia']
|
||||
model_variables['transcript_template'] = templates['transcript']
|
||||
model_variables['html_parse_template'] = templates['html_parse']
|
||||
model_variables['pdf_parse_template'] = templates['pdf_parse']
|
||||
|
||||
model_variables['annotation_chunk_length'] = current_app.config['ANNOTATION_TEXT_CHUNK_LENGTH'][tenant.llm_model]
|
||||
|
||||
# Transcription Client Variables.
|
||||
# Using Groq
|
||||
# api_key = current_app.config.get('GROQ_API_KEY')
|
||||
# model_variables['transcription_client'] = Groq(api_key=api_key)
|
||||
# model_variables['transcription_model'] = 'whisper-large-v3'
|
||||
|
||||
# Using OpenAI
|
||||
api_key = current_app.config.get('OPENAI_API_KEY')
|
||||
model_variables['transcription_client'] = OpenAI(api_key=api_key)
|
||||
model_variables['transcription_model'] = 'whisper-1'
|
||||
|
||||
30
common/utils/os_utils.py
Normal file
30
common/utils/os_utils.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
import gevent
|
||||
import time
|
||||
from flask import current_app
|
||||
|
||||
|
||||
def safe_remove(file_path):
    """Delete *file_path* if it exists, then block until the deletion is visible.

    Outcomes are logged through the Flask app logger. Any error raised while
    checking, removing, or waiting is logged and then re-raised, including a
    TimeoutError when the file is still present 5 seconds after removal.
    """
    try:
        if not os.path.exists(file_path):
            current_app.logger.info(f"{file_path} does not exist, skipping deletion")
            return

        os.remove(file_path)

        # Poll until the file is gone; gevent.sleep yields cooperatively so
        # other greenlets keep running while we wait.
        waited_from = time.time()
        while os.path.exists(file_path):
            gevent.sleep(1)
            if time.time() - waited_from > 5:  # give up after 5 seconds
                raise TimeoutError(f"Failed to delete {file_path} after 5 seconds")

        current_app.logger.info(f"Successfully deleted {file_path}")
    except Exception as e:
        current_app.logger.error(f"Error deleting {file_path}: {str(e)}")
        raise
|
||||
|
||||
|
||||
def sync_folder(file_path):
    """Flush directory metadata for *file_path* to stable storage.

    POSIX requires a separate fsync on the containing directory to make
    recently created/renamed/deleted entries durable; fsyncing only the
    file is not enough.

    Args:
        file_path: Path of the directory to sync.

    Raises:
        OSError: if the directory cannot be opened or synced.
    """
    dir_fd = os.open(file_path, os.O_RDONLY)
    try:
        os.fsync(dir_fd)
    finally:
        # Always release the descriptor, even when fsync fails (fd-leak fix:
        # the original skipped os.close on an fsync error).
        os.close(dir_fd)
|
||||
15
common/utils/prompt_loader.py
Normal file
15
common/utils/prompt_loader.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
import yaml
|
||||
|
||||
|
||||
def load_prompt_templates(model_name):
    """Load the YAML prompt templates for a ``<provider>.<model>`` identifier.

    Looks up ``config/prompts/<provider>/<model>.yaml`` relative to the
    current working directory and parses it with ``yaml.safe_load``.

    Args:
        model_name: Identifier of the form ``provider.model``
            (e.g. ``openai.gpt-4o``).

    Returns:
        The parsed YAML content (typically a dict of template strings).

    Raises:
        ValueError: if ``model_name`` is not of the form ``provider.model``.
        FileNotFoundError: if no template file exists for the identifier.
    """
    parts = model_name.split('.')
    # Validate explicitly instead of letting tuple unpacking raise a
    # cryptic "not enough values to unpack" ValueError.
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"model_name must be of the form '<provider>.<model>', got {model_name!r}")
    provider, model = parts

    file_path = os.path.join('config', 'prompts', provider, f'{model}.yaml')

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No prompt template file found for {model_name}")

    # Explicit encoding so template files parse identically on every platform
    # (the default encoding is locale-dependent).
    with open(file_path, 'r', encoding='utf-8') as file:
        templates = yaml.safe_load(file)

    return templates
|
||||
Reference in New Issue
Block a user