Improve algorithms for HTML and PDF processing
This commit is contained in:
@@ -8,7 +8,7 @@ import ast
|
||||
from typing import List
|
||||
from openai import OpenAI
|
||||
|
||||
from common.models.document import EmbeddingSmallOpenAI
|
||||
from common.models.document import EmbeddingSmallOpenAI, EmbeddingLargeOpenAI
|
||||
|
||||
|
||||
class CitedAnswer(BaseModel):
|
||||
@@ -83,6 +83,10 @@ def select_model_variables(tenant):
|
||||
model_variables['html_included_elements'] = tenant.html_included_elements
|
||||
model_variables['html_excluded_elements'] = tenant.html_excluded_elements
|
||||
|
||||
# Set Chunk Size variables
|
||||
model_variables['min_chunk_size'] = tenant.min_chunk_size
|
||||
model_variables['max_chunk_size'] = tenant.max_chunk_size
|
||||
|
||||
# Set Embedding variables
|
||||
match embedding_provider:
|
||||
case 'openai':
|
||||
@@ -92,8 +96,11 @@ def select_model_variables(tenant):
|
||||
model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
|
||||
model='text-embedding-3-small')
|
||||
model_variables['embedding_db_model'] = EmbeddingSmallOpenAI
|
||||
model_variables['min_chunk_size'] = current_app.config.get('OAI_TE3S_MIN_CHUNK_SIZE')
|
||||
model_variables['max_chunk_size'] = current_app.config.get('OAI_TE3S_MAX_CHUNK_SIZE')
|
||||
case 'text-embedding-3-large':
|
||||
api_key = current_app.config.get('OPENAI_API_KEY')
|
||||
model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
|
||||
model='text-embedding-3-large')
|
||||
model_variables['embedding_db_model'] = EmbeddingLargeOpenAI
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
f'error: Invalid embedding model')
|
||||
@@ -119,13 +126,9 @@ def select_model_variables(tenant):
|
||||
history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
|
||||
encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
|
||||
transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
|
||||
html_parse_template = current_app.config.get('GPT4_HTML_PARSE_TEMPLATE')
|
||||
pdf_parse_template = current_app.config.get('GPT4_PDF_PARSE_TEMPLATE')
|
||||
tool_calling_supported = True
|
||||
case 'gpt-3-5-turbo':
|
||||
summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
|
||||
rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
|
||||
history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
|
||||
encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
|
||||
transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
|
||||
case _:
|
||||
raise Exception(f'Error setting model variables for tenant {tenant.id} '
|
||||
f'error: Invalid chat model')
|
||||
@@ -134,6 +137,8 @@ def select_model_variables(tenant):
|
||||
model_variables['history_template'] = history_template
|
||||
model_variables['encyclopedia_template'] = encyclopedia_template
|
||||
model_variables['transcript_template'] = transcript_template
|
||||
model_variables['html_parse_template'] = html_parse_template
|
||||
model_variables['pdf_parse_template'] = pdf_parse_template
|
||||
if tool_calling_supported:
|
||||
model_variables['cited_answer_cls'] = CitedAnswer
|
||||
case _:
|
||||
|
||||
Reference in New Issue
Block a user