Improve algorithms for HTML and PDF processing

This commit is contained in:
Josako
2024-07-08 15:20:45 +02:00
parent 318d23d8c6
commit ea0127b4b8
6 changed files with 176 additions and 194 deletions

View File

@@ -8,7 +8,7 @@ import ast
from typing import List
from openai import OpenAI
-from common.models.document import EmbeddingSmallOpenAI
+from common.models.document import EmbeddingSmallOpenAI, EmbeddingLargeOpenAI
class CitedAnswer(BaseModel):
@@ -83,6 +83,10 @@ def select_model_variables(tenant):
model_variables['html_included_elements'] = tenant.html_included_elements
model_variables['html_excluded_elements'] = tenant.html_excluded_elements
+# Set Chunk Size variables
+model_variables['min_chunk_size'] = tenant.min_chunk_size
+model_variables['max_chunk_size'] = tenant.max_chunk_size
# Set Embedding variables
match embedding_provider:
case 'openai':
@@ -92,8 +96,11 @@ def select_model_variables(tenant):
model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
model='text-embedding-3-small')
model_variables['embedding_db_model'] = EmbeddingSmallOpenAI
-model_variables['min_chunk_size'] = current_app.config.get('OAI_TE3S_MIN_CHUNK_SIZE')
-model_variables['max_chunk_size'] = current_app.config.get('OAI_TE3S_MAX_CHUNK_SIZE')
+case 'text-embedding-3-large':
+api_key = current_app.config.get('OPENAI_API_KEY')
+model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
+model='text-embedding-3-large')
+model_variables['embedding_db_model'] = EmbeddingLargeOpenAI
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid embedding model')
@@ -119,13 +126,9 @@ def select_model_variables(tenant):
history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
+html_parse_template = current_app.config.get('GPT4_HTML_PARSE_TEMPLATE')
+pdf_parse_template = current_app.config.get('GPT4_PDF_PARSE_TEMPLATE')
tool_calling_supported = True
-case 'gpt-3-5-turbo':
-summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
-rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
-history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
-encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
-transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
case _:
raise Exception(f'Error setting model variables for tenant {tenant.id} '
f'error: Invalid chat model')
@@ -134,6 +137,8 @@ def select_model_variables(tenant):
model_variables['history_template'] = history_template
model_variables['encyclopedia_template'] = encyclopedia_template
model_variables['transcript_template'] = transcript_template
+model_variables['html_parse_template'] = html_parse_template
+model_variables['pdf_parse_template'] = pdf_parse_template
if tool_calling_supported:
model_variables['cited_answer_cls'] = CitedAnswer
case _: