Improve algorithms for HTML and PDF processing
This commit is contained in:
@@ -59,7 +59,7 @@ class Config(object):
|
||||
|
||||
# Supported embedding models and LLMs
|
||||
SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
|
||||
SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402']
|
||||
SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo']
|
||||
|
||||
# Celery settings
|
||||
CELERY_TASK_SERIALIZER = 'json'
|
||||
@@ -69,16 +69,45 @@ class Config(object):
|
||||
CELERY_ENABLE_UTC = True
|
||||
|
||||
# Chunk size limits (min/max), dependent on the embedding model
|
||||
OAI_TE3S_MIN_CHUNK_SIZE = 2000
|
||||
OAI_TE3S_MAX_CHUNK_SIZE = 3000
|
||||
OAI_TE3L_MIN_CHUNK_SIZE = 3000
|
||||
OAI_TE3L_MAX_CHUNK_SIZE = 4000
|
||||
# OAI_TE3S_MIN_CHUNK_SIZE = 2000
|
||||
# OAI_TE3S_MAX_CHUNK_SIZE = 3000
|
||||
# OAI_TE3L_MIN_CHUNK_SIZE = 3000
|
||||
# OAI_TE3L_MAX_CHUNK_SIZE = 4000
|
||||
|
||||
# LLM TEMPLATES
|
||||
GPT4_HTML_PARSE_TEMPLATE = """You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
|
||||
|
||||
# Best practices are:
|
||||
- Respect wordings and language(s) used in the HTML.
|
||||
- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
|
||||
- Sub-headers can be used as lists. This is true when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.
|
||||
- Be careful of encoding of the text. Everything needs to be human readable.
|
||||
|
||||
Process the file carefully, and take a stepped approach. The resulting markdown should be the result of the processing of the complete input html file. Answer with the pure markdown, without any other text.
|
||||
|
||||
HTML is between triple backquotes.
|
||||
|
||||
```{html}```"""
|
||||
|
||||
GPT4_PDF_PARSE_TEMPLATE = """You are a top administrative aid specialized in transforming given PDF-files into markdown formatted files. The generated files will be used to generate embeddings in a RAG-system.
|
||||
|
||||
# Best practices are:
|
||||
- Respect wordings and language(s) used in the PDF.
|
||||
- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
|
||||
- When headings are numbered, show the numbering and define the header level.
|
||||
- A new item is started when a <return> is found before a full line is reached. In order to know the number of characters in a line, please check the document and the context within the document (e.g. an image could limit the number of characters temporarily).
|
||||
- Paragraphs are to be stripped of newlines so they become easily readable.
|
||||
- Be careful of encoding of the text. Everything needs to be human readable.
|
||||
|
||||
Process the file carefully, and take a stepped approach. The resulting markdown should be the result of the processing of the complete input pdf content. Answer with the pure markdown, without any other text.
|
||||
|
||||
PDF content is between triple backquotes.
|
||||
|
||||
```{pdf_content}```
|
||||
"""
|
||||
|
||||
GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
|
||||
```{text}```"""
|
||||
GPT3_5_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
|
||||
```{text}```"""
|
||||
|
||||
GPT4_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
|
||||
{tenant_context}
|
||||
@@ -88,14 +117,6 @@ class Config(object):
|
||||
```{context}```
|
||||
Question:
|
||||
{question}"""
|
||||
GPT3_5_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
|
||||
{tenant_context}
|
||||
Use the following {language} in your communication.
|
||||
If the question cannot be answered using the given context, say "I have insufficient information to answer this question."
|
||||
Context:
|
||||
```{context}```
|
||||
Question:
|
||||
{question}"""
|
||||
|
||||
GPT4_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
|
||||
in such a way that the question is understandable without the previous context.
|
||||
@@ -108,29 +129,12 @@ class Config(object):
|
||||
Question to be detailed:
|
||||
{question}"""
|
||||
|
||||
GPT3_5_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
|
||||
in such a way that the question is understandable without the previous context.
|
||||
{tenant_context}
|
||||
The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
|
||||
The history is delimited between triple backquotes.
|
||||
You answer by stating the question in {language}.
|
||||
History:
|
||||
```{history}```
|
||||
Question to be detailed:
|
||||
{question}"""
|
||||
|
||||
GPT4_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
|
||||
'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
|
||||
If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
|
||||
Question:
|
||||
{question}"""
|
||||
|
||||
GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
|
||||
'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
|
||||
If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
|
||||
Question:
|
||||
{question}"""
|
||||
|
||||
GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
|
||||
and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
|
||||
Do the following:
|
||||
@@ -141,16 +145,6 @@ class Config(object):
|
||||
```{transcript}```
|
||||
"""
|
||||
|
||||
GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
|
||||
and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
|
||||
Do the following:
|
||||
- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
|
||||
- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
|
||||
- improve errors in the transcript given the context, but leave the text intact.
|
||||
|
||||
```{transcript}```
|
||||
"""
|
||||
|
||||
# SocketIO settings
|
||||
# SOCKETIO_ASYNC_MODE = 'threading'
|
||||
SOCKETIO_ASYNC_MODE = 'gevent'
|
||||
|
||||
Reference in New Issue
Block a user