- Improve annotation algorithm for YouTube (and others)

- Patch Pytube
- Improve OS-level deletion and writing of files
- Start working on Claude
- Improve template management
This commit is contained in:
Josako
2024-07-16 14:21:49 +02:00
parent db44fd3b66
commit 908a2eaf7e
39 changed files with 6427 additions and 324 deletions

View File

@@ -1,12 +1,13 @@
import langcodes
from flask import current_app
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate
import ast
from typing import List
from openai import OpenAI
# from groq import Groq
from common.models.document import EmbeddingSmallOpenAI, EmbeddingLargeOpenAI
@@ -121,31 +122,46 @@ def select_model_variables(tenant):
tool_calling_supported = False
match llm_model:
case 'gpt-4-turbo' | 'gpt-4o':
summary_template = current_app.config.get('GPT4_SUMMARY_TEMPLATE')
rag_template = current_app.config.get('GPT4_RAG_TEMPLATE')
history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
html_parse_template = current_app.config.get('GPT4_HTML_PARSE_TEMPLATE')
pdf_parse_template = current_app.config.get('GPT4_PDF_PARSE_TEMPLATE')
tool_calling_supported = True
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat model')
model_variables['summary_template'] = summary_template
model_variables['rag_template'] = rag_template
model_variables['history_template'] = history_template
model_variables['encyclopedia_template'] = encyclopedia_template
model_variables['transcript_template'] = transcript_template
model_variables['html_parse_template'] = html_parse_template
model_variables['pdf_parse_template'] = pdf_parse_template
if tool_calling_supported:
model_variables['cited_answer_cls'] = CitedAnswer
case 'anthropic':
api_key = current_app.config.get('ANTHROPIC_API_KEY')
# Anthropic does not have the same 'generic' model names as OpenAI
llm_model_ext = current_app.config.get('ANTHROPIC_LLM_VERSIONS').get(llm_model)
model_variables['llm'] = ChatAnthropic(api_key=api_key,
model=llm_model_ext,
temperature=model_variables['RAG_temperature'])
model_variables['llm_no_rag'] = ChatAnthropic(api_key=api_key,
model=llm_model_ext,
temperature=model_variables['RAG_temperature'])
tool_calling_supported = True
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat provider')
# Transcription Client Variables. Only Whisper-1 of OpenAI is currently supported
if tool_calling_supported:
model_variables['cited_answer_cls'] = CitedAnswer
templates = current_app.config['PROMPT_TEMPLATES'][f'{llm_provider}.{llm_model}']
model_variables['summary_template'] = templates['summary']
model_variables['rag_template'] = templates['rag']
model_variables['history_template'] = templates['history']
model_variables['encyclopedia_template'] = templates['encyclopedia']
model_variables['transcript_template'] = templates['transcript']
model_variables['html_parse_template'] = templates['html_parse']
model_variables['pdf_parse_template'] = templates['pdf_parse']
model_variables['annotation_chunk_length'] = current_app.config['ANNOTATION_TEXT_CHUNK_LENGTH'][tenant.llm_model]
# Transcription Client Variables.
# Using Groq
# api_key = current_app.config.get('GROQ_API_KEY')
# model_variables['transcription_client'] = Groq(api_key=api_key)
# model_variables['transcription_model'] = 'whisper-large-v3'
# Using OpenAI
api_key = current_app.config.get('OPENAI_API_KEY')
model_variables['transcription_client'] = OpenAI(api_key=api_key)
model_variables['transcription_model'] = 'whisper-1'