- Improve annotation algorithm for YouTube (and others)

- Patch Pytube - Improve OS-level deletion and writing of files - Start working on Claude (Anthropic) support - Improve template management
2024-07-16 14:21:49 +02:00
parent db44fd3b66
commit 908a2eaf7e
39 changed files with 6427 additions and 324 deletions
--- a/config/config.py
+++ b/config/config.py
@@ -2,6 +2,8 @@ from os import environ, path
 from datetime import timedelta
 import redis

+from common.utils.prompt_loader import load_prompt_templates
+
 basedir = path.abspath(path.dirname(__file__))


@@ -59,7 +61,18 @@ class Config(object):

    # supported LLMs
    SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
-    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo']
+    SUPPORTED_LLMS = ['openai.gpt-4o', 'anthropic.claude-3-5-sonnet']
+
+    ANTHROPIC_LLM_VERSIONS = {'claude-3-5-sonnet': 'claude-3-5-sonnet-20240620', }
+
+    # Load prompt templates dynamically
+    PROMPT_TEMPLATES = {model: load_prompt_templates(model) for model in SUPPORTED_LLMS}
+
+    # Annotation text chunk length
+    ANNOTATION_TEXT_CHUNK_LENGTH = {
+        'openai.gpt-4o': 10000,
+        'anthropic.claude-3-5-sonnet': 8000
+    }

    # Celery settings
    CELERY_TASK_SERIALIZER = 'json'
@@ -68,83 +81,6 @@ class Config(object):
    CELERY_TIMEZONE = 'UTC'
    CELERY_ENABLE_UTC = True

-    # Chunk Definition, Embedding dependent
-    # OAI_TE3S_MIN_CHUNK_SIZE = 2000
-    # OAI_TE3S_MAX_CHUNK_SIZE = 3000
-    # OAI_TE3L_MIN_CHUNK_SIZE = 3000
-    # OAI_TE3L_MAX_CHUNK_SIZE = 4000
-
-    # LLM TEMPLATES
-    GPT4_HTML_PARSE_TEMPLATE = """You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
-
-    # Best practices are:
-    - Respect wordings and language(s) used in the HTML.
-    - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
-    - Sub-headers can be used as lists. This is true when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.  
-    - Be careful of encoding of the text. Everything needs to be human readable.
-    
-    Process the file carefully, and take a stepped approach. The resulting markdown should be the result of the processing of the complete input html file. Answer with the pure markdown, without any other text.
-    
-    HTML is between triple backquotes.
-    
-    ```{html}```"""
-
-    GPT4_PDF_PARSE_TEMPLATE = """You are a top administrative aid specialized in transforming given PDF-files into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
-
-    # Best practices are:
-    - Respect wordings and language(s) used in the PDF.
-    - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
-    - When headings are numbered, show the numbering and define the header level. 
-    - A new item is started when a <return> is found before a full line is reached. In order to know the number of characters in a line, please check the document and the context within the document (e.g. an image could limit the number of characters temporarily).
-    - Paragraphs are to be stripped of newlines so they become easily readable.
-    - Be careful of encoding of the text. Everything needs to be human readable.
-
-    Process the file carefully, and take a stepped approach. The resulting markdown should be the result of the processing of the complete input pdf content. Answer with the pure markdown, without any other text.
-    
-    PDF content is between triple backquotes.
-    
-    ```{pdf_content}```
-    """
-
-    GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
-    ```{text}```"""
-
-    GPT4_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes. 
-    {tenant_context}
-    Use the following {language} in your communication, and cite the sources used.
-    If the question cannot be answered using the given context, say "I have insufficient information to answer this question."
-    Context:
-    ```{context}```
-    Question:
-    {question}"""
-
-    GPT4_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
-    in such a way that the question is understandable without the previous context. 
-    {tenant_context}
-    The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
-    The history is delimited between triple backquotes.
-    You answer by stating the question in {language}.
-    History:
-    ```{history}```
-    Question to be detailed:
-    {question}"""
-
-    GPT4_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of 
-    'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question. 
-    If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
-    Question:
-    {question}"""
-
-    GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts 
-    and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
-    Do the following:
-    - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
-    - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
-    - improve errors in the transcript given the context, but leave the text intact.
-    
-    ```{transcript}``` 
-    """
-
    # SocketIO settings
    # SOCKETIO_ASYNC_MODE = 'threading'
    SOCKETIO_ASYNC_MODE = 'gevent'
@@ -212,6 +148,9 @@ class DevConfig(Config):
    # Groq API Keys
    GROQ_API_KEY = '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'

+    # Anthropic API Keys
+    ANTHROPIC_API_KEY = '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'
+
    # Unstructured settings
    UNSTRUCTURED_API_KEY = 'pDgCrXumYhM3CNvjvwV8msMldXC3uw'
    UNSTRUCTURED_BASE_URL = 'https://flowitbv-16c4us0m.api.unstructuredapp.io'