Improve algorithms for HTML and PDF processing

Author: Josako
Date: 2024-07-08 15:20:45 +02:00
parent 318d23d8c6
commit ea0127b4b8
6 changed files with 176 additions and 194 deletions

View File

@@ -8,7 +8,7 @@ import ast
 from typing import List
 from openai import OpenAI
-from common.models.document import EmbeddingSmallOpenAI
+from common.models.document import EmbeddingSmallOpenAI, EmbeddingLargeOpenAI

 class CitedAnswer(BaseModel):
@@ -83,6 +83,10 @@ def select_model_variables(tenant):
     model_variables['html_included_elements'] = tenant.html_included_elements
     model_variables['html_excluded_elements'] = tenant.html_excluded_elements

+    # Set Chunk Size variables
+    model_variables['min_chunk_size'] = tenant.min_chunk_size
+    model_variables['max_chunk_size'] = tenant.max_chunk_size
+
     # Set Embedding variables
     match embedding_provider:
         case 'openai':
@@ -92,8 +96,11 @@ def select_model_variables(tenant):
                     model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
                                                                           model='text-embedding-3-small')
                     model_variables['embedding_db_model'] = EmbeddingSmallOpenAI
-                    model_variables['min_chunk_size'] = current_app.config.get('OAI_TE3S_MIN_CHUNK_SIZE')
-                    model_variables['max_chunk_size'] = current_app.config.get('OAI_TE3S_MAX_CHUNK_SIZE')
+                case 'text-embedding-3-large':
+                    api_key = current_app.config.get('OPENAI_API_KEY')
+                    model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
+                                                                          model='text-embedding-3-large')
+                    model_variables['embedding_db_model'] = EmbeddingLargeOpenAI
                 case _:
                     raise Exception(f'Error setting model variables for tenant {tenant.id} '
                                     f'error: Invalid embedding model')
@@ -119,13 +126,9 @@ def select_model_variables(tenant):
             history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
             encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
             transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
+            html_parse_template = current_app.config.get('GPT4_HTML_PARSE_TEMPLATE')
+            pdf_parse_template = current_app.config.get('GPT4_PDF_PARSE_TEMPLATE')
             tool_calling_supported = True
-        case 'gpt-3-5-turbo':
-            summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
-            rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
-            history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
-            encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
-            transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
         case _:
             raise Exception(f'Error setting model variables for tenant {tenant.id} '
                             f'error: Invalid chat model')
@@ -134,6 +137,8 @@ def select_model_variables(tenant):
             model_variables['history_template'] = history_template
             model_variables['encyclopedia_template'] = encyclopedia_template
             model_variables['transcript_template'] = transcript_template
+            model_variables['html_parse_template'] = html_parse_template
+            model_variables['pdf_parse_template'] = pdf_parse_template
             if tool_calling_supported:
                 model_variables['cited_answer_cls'] = CitedAnswer
         case _:
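The net effect of this file's hunks: chunk sizes move from per-model config constants to tenant-level settings, while the match dispatch selects the embedding classes. A minimal, simplified sketch of the resulting pattern (the Tenant dataclass here is a hypothetical stand-in for the real model, and class names are shown as strings to keep it self-contained):

from dataclasses import dataclass

@dataclass
class Tenant:  # hypothetical stand-in for the real Tenant model
    id: int
    min_chunk_size: int
    max_chunk_size: int
    embedding_model: str

def select_model_variables_sketch(tenant):
    model_variables = {
        # Chunk sizes now come from the tenant record, not config constants
        'min_chunk_size': tenant.min_chunk_size,
        'max_chunk_size': tenant.max_chunk_size,
    }
    match tenant.embedding_model:
        case 'text-embedding-3-small':
            model_variables['embedding_db_model'] = 'EmbeddingSmallOpenAI'
        case 'text-embedding-3-large':
            model_variables['embedding_db_model'] = 'EmbeddingLargeOpenAI'
        case _:
            raise Exception(f'Error setting model variables for tenant {tenant.id} '
                            f'error: Invalid embedding model')
    return model_variables

print(select_model_variables_sketch(Tenant(1, 2000, 3000, 'text-embedding-3-large')))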

View File

@@ -59,7 +59,7 @@ class Config(object):
     # supported LLMs
     SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
-    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402']
+    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo']

     # Celery settings
     CELERY_TASK_SERIALIZER = 'json'
@@ -69,16 +69,45 @@ class Config(object):
     CELERY_ENABLE_UTC = True

     # Chunk Definition, Embedding dependent
-    OAI_TE3S_MIN_CHUNK_SIZE = 2000
-    OAI_TE3S_MAX_CHUNK_SIZE = 3000
-    OAI_TE3L_MIN_CHUNK_SIZE = 3000
-    OAI_TE3L_MAX_CHUNK_SIZE = 4000
+    # OAI_TE3S_MIN_CHUNK_SIZE = 2000
+    # OAI_TE3S_MAX_CHUNK_SIZE = 3000
+    # OAI_TE3L_MIN_CHUNK_SIZE = 3000
+    # OAI_TE3L_MAX_CHUNK_SIZE = 4000

     # LLM TEMPLATES
+    GPT4_HTML_PARSE_TEMPLATE = """You are a top administrative assistant specialized in transforming given HTML into markdown formatted files. The generated files will be used to generate embeddings in a RAG system.
+# Best practices are:
+- Respect the wordings and language(s) used in the HTML.
+- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
+- Sub-headers can be used as lists. This is the case when a header is followed by a series of sub-headers without content (paragraphs or listed items). Present those sub-headers as a list.
+- Be careful with the encoding of the text. Everything needs to be human readable.
+Process the file carefully, working step by step. The resulting markdown must cover the complete input HTML file. Answer with the pure markdown, without any other text.
+The HTML is between triple backquotes.
+```{html}```"""
+    GPT4_PDF_PARSE_TEMPLATE = """You are a top administrative aide specialized in transforming given PDF files into markdown formatted files. The generated files will be used to generate embeddings in a RAG system.
+# Best practices are:
+- Respect the wordings and language(s) used in the PDF.
+- The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
+- When headings are numbered, show the numbering and set the header level accordingly.
+- A new item starts when a <return> is found before a full line is reached. To determine the number of characters in a line, check the document and the context within the document (e.g. an image could temporarily limit the number of characters).
+- Strip paragraphs of newlines so they become easily readable.
+- Be careful with the encoding of the text. Everything needs to be human readable.
+Process the file carefully, working step by step. The resulting markdown must cover the complete input PDF content. Answer with the pure markdown, without any other text.
+The PDF content is between triple backquotes.
+```{pdf_content}```
+"""
     GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
 ```{text}```"""
-    GPT3_5_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
-```{text}```"""

     GPT4_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
 {tenant_context}
@@ -88,14 +117,6 @@ class Config(object):
 ```{context}```

 Question:
 {question}"""
-    GPT3_5_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
-{tenant_context}
-Use the following {language} in your communication.
-If the question cannot be answered using the given context, say "I have insufficient information to answer this question."
-Context:
-```{context}```
-Question:
-{question}"""

     GPT4_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
 in such a way that the question is understandable without the previous context.
@@ -108,29 +129,12 @@ class Config(object):
 Question to be detailed:
 {question}"""
-    GPT3_5_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
-in such a way that the question is understandable without the previous context.
-{tenant_context}
-The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
-The history is delimited between triple backquotes.
-You answer by stating the question in {language}.
-History:
-```{history}```
-Question to be detailed:
-{question}"""

     GPT4_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
 'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
 If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.

 Question:
 {question}"""
-    GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
-'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
-If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
-Question:
-{question}"""

     GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
 and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
 Do the following:
@@ -141,16 +145,6 @@ class Config(object):
 ```{transcript}```
 """
-    GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
-and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
-Do the following:
-- divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
-- annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
-- improve errors in the transcript given the context, but leave the text intact.
-```{transcript}```
-"""

     # SocketIO settings
     # SOCKETIO_ASYNC_MODE = 'threading'
     SOCKETIO_ASYNC_MODE = 'gevent'
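The {html} and {pdf_content} placeholders in the new parse templates are bound later through LangChain's ChatPromptTemplate (see the worker changes further down). A minimal sketch of that binding, using a trimmed stand-in for GPT4_HTML_PARSE_TEMPLATE and assuming langchain-core is installed:

from langchain_core.prompts import ChatPromptTemplate

# Trimmed stand-in for GPT4_HTML_PARSE_TEMPLATE; the placeholder name matches.
template = """Transform the given HTML into markdown. Answer with the pure markdown.
The HTML is between triple backquotes.
```{html}```"""

prompt = ChatPromptTemplate.from_template(template)
messages = prompt.format_messages(html='<h1>Title</h1><p>Body text.</p>')
print(messages[0].content)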

View File

@@ -7,6 +7,15 @@
 # You can add other services your application may depend on here, such as a
 # database or a cache. For examples, see the Awesome Compose repository:
 # https://github.com/docker/awesome-compose
+x-common-variables: &common-variables
+  DB_HOST: db
+  DB_USER: luke
+  DB_PASS: Skywalker!
+  DB_NAME: eveai
+  FLASK_ENV: development
+  FLASK_DEBUG: 1
+
 services:
   nginx:
     image: nginx:latest
@@ -30,12 +39,7 @@ services:
     ports:
       - 5001:5001
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
-      - DB_HOST=db
-      - DB_USER=luke
-      - DB_PASS=Skywalker!
-      - DB_NAME=eveai
+      <<: *common-variables
     volumes:
       - ../eveai_app:/app/eveai_app
       - ../common:/app/common
@@ -63,8 +67,7 @@ services:
 #    ports:
 #      - 5001:5001
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
+      <<: *common-variables
     volumes:
       - ../eveai_workers:/app/eveai_workers
       - ../common:/app/common
@@ -91,8 +94,7 @@ services:
     ports:
       - 5002:5002
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
+      <<: *common-variables
     volumes:
       - ../eveai_chat:/app/eveai_chat
       - ../common:/app/common
@@ -118,8 +120,7 @@ services:
 #    ports:
 #      - 5001:5001
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
+      <<: *common-variables
     volumes:
       - ../eveai_chat_workers:/app/eveai_chat_workers
       - ../common:/app/common
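This compose change relies on two YAML features: a top-level extension field (x-common-variables) holding an anchored mapping, and the merge key <<: *common-variables splicing that mapping into each service's environment. A quick way to see the merge semantics from Python, assuming PyYAML is available (Docker Compose resolves the anchor the same way):

import yaml  # PyYAML

snippet = """
x-common-variables: &common-variables
  DB_HOST: db
  FLASK_DEBUG: 1

services:
  eveai_app:
    environment:
      <<: *common-variables
"""

data = yaml.safe_load(snippet)
# The merge key is resolved on load: the anchored keys appear inline.
print(data['services']['eveai_app']['environment'])
# -> {'DB_HOST': 'db', 'FLASK_DEBUG': 1}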

View File

@@ -14,11 +14,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy.exc import SQLAlchemyError
-# Unstructured commercial client imports
-from unstructured_client import UnstructuredClient
-from unstructured_client.models import shared
-from unstructured_client.models.errors import SDKError
 from pytube import YouTube
+import PyPDF2

 from common.extensions import db
 from common.models.document import DocumentVersion, Embedding
@@ -105,22 +102,19 @@ def create_embeddings(tenant_id, document_version_id):
 def process_pdf(tenant, model_variables, document_version):
+    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
+                             document_version.file_location)
     file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location,
                              document_version.file_name)

     if os.path.exists(file_path):
-        with open(file_path, 'rb') as f:
-            files = shared.Files(content=f.read(), file_name=document_version.file_name)
-            req = shared.PartitionParameters(
-                files=files,
-                strategy='hi_res',
-                hi_res_model_name='yolox',
-                coordinates=True,
-                extract_image_block_types=['Image', 'Table'],
-                chunking_strategy='by_title',
-                combine_under_n_chars=model_variables['min_chunk_size'],
-                max_characters=model_variables['max_chunk_size'],
-            )
+        pdf_text = ''
+        # Extract the text from the PDF, page by page, into a single string
+        with open(file_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                pdf_text += page.extract_text()
     else:
         current_app.logger.error(f'The physical file for document version {document_version.id} '
                                  f'for tenant {tenant.id} '
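The Unstructured hi_res partitioning is replaced here by plain text extraction with PyPDF2. Isolated as a standalone sketch (hedging extract_text() with `or ''`, since it can yield empty output on image-only pages):

import PyPDF2

def extract_pdf_text(file_path):
    # Concatenate the text of every page into one string
    pdf_text = ''
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            pdf_text += page.extract_text() or ''
    return pdf_text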
@@ -128,17 +122,22 @@ def process_pdf(tenant, model_variables, document_version):
         create_embeddings.update_state(state=states.FAILURE)
         raise

-    try:
-        chunks = partition_doc_unstructured(tenant, document_version, req)
-    except Exception as e:
-        current_app.logger.error(f'Unable to create Embeddings for tenant {tenant.id} '
-                                 f'while processing PDF on document version {document_version.id} '
-                                 f'error: {e}')
-        create_embeddings.update_state(state=states.FAILURE)
-        raise
+    markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
+    markdown_file_name = f'{document_version.id}.md'
+    output_file = os.path.join(base_path, markdown_file_name)
+    with open(output_file, 'w') as f:
+        f.write(markdown)
+
+    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+                                         model_variables['max_chunk_size'])

-    summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
-    document_version.system_context = f'Summary: {summary}\n'
+    if len(chunks) > 1:
+        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
+        document_version.system_context = f'Summary: {summary}\n'
+    else:
+        document_version.system_context = ''

     enriched_chunks = enrich_chunks(tenant, document_version, chunks)
     embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
@@ -150,10 +149,8 @@ def process_pdf(tenant, model_variables, document_version):
         db.session.commit()
     except SQLAlchemyError as e:
         current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-                                 f'on PDF, document version {document_version.id}'
+                                 f'on HTML, document version {document_version.id}'
                                  f'error: {e}')
-        db.session.rollback()
-        create_embeddings.update_state(state=states.FAILURE)
         raise

     current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
@@ -179,6 +176,9 @@ def process_html(tenant, model_variables, document_version):
     html_included_elements = model_variables['html_included_elements']
     html_excluded_elements = model_variables['html_excluded_elements']

+    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
+                             document_version.file_location)
     file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location,
                              document_version.file_name)
@@ -193,16 +193,22 @@ def process_html(tenant, model_variables, document_version):
         create_embeddings.update_state(state=states.FAILURE)
         raise

-    extracted_data, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
+    extracted_html, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
                                        excluded_elements=html_excluded_elements)
-    potential_chunks = create_potential_chunks(extracted_data, html_end_tags)
-    current_app.embed_tuning_logger.debug(f'Nr of potential chunks: {len(potential_chunks)}')
-    chunks = combine_chunks(potential_chunks,
-                            model_variables['min_chunk_size'],
-                            model_variables['max_chunk_size']
-                            )
-    current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
+    extracted_file_name = f'{document_version.id}-extracted.html'
+    output_file = os.path.join(base_path, extracted_file_name)
+    with open(output_file, 'w') as f:
+        f.write(extracted_html)
+
+    markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
+    markdown_file_name = f'{document_version.id}.md'
+    output_file = os.path.join(base_path, markdown_file_name)
+    with open(output_file, 'w') as f:
+        f.write(markdown)
+
+    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+                                         model_variables['max_chunk_size'])

     if len(chunks) > 1:
         summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -253,6 +259,40 @@ def enrich_chunks(tenant, document_version, chunks):
     return enriched_chunks

+def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
+    current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
+                             f'on document version {document_version.id}')
+    llm = model_variables['llm']
+    template = model_variables['html_parse_template']
+    parse_prompt = ChatPromptTemplate.from_template(template)
+    setup = RunnablePassthrough()
+    output_parser = StrOutputParser()
+
+    chain = setup | parse_prompt | llm | output_parser
+
+    input_html = {"html": html_content}
+    markdown = chain.invoke(input_html)
+
+    return markdown
+
+def generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_content):
+    current_app.logger.debug(f'Generating Markdown from PDF for tenant {tenant.id} '
+                             f'on document version {document_version.id}')
+    llm = model_variables['llm']
+    template = model_variables['pdf_parse_template']
+    parse_prompt = ChatPromptTemplate.from_template(template)
+    setup = RunnablePassthrough()
+    output_parser = StrOutputParser()
+
+    chain = setup | parse_prompt | llm | output_parser
+
+    input_pdf = {"pdf_content": pdf_content}
+    markdown = chain.invoke(input_pdf)
+
+    return markdown
+
 def summarize_chunk(tenant, model_variables, document_version, chunk):
     current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
                              f'on document version {document_version.id}')
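Both new helpers build the same LCEL pipe: passthrough input, then prompt, then model, then string parser. A runnable sketch of that chain shape with a stub in place of the model (RunnableLambda stands in for model_variables['llm'], so no API key is needed):

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Stub model: any Runnable that maps a prompt value to text fits this slot.
stub_llm = RunnableLambda(lambda prompt_value: '# Title\n\nBody text.')

parse_prompt = ChatPromptTemplate.from_template('Convert to markdown:\n```{html}```')
chain = RunnablePassthrough() | parse_prompt | stub_llm | StrOutputParser()

print(chain.invoke({'html': '<h1>Title</h1><p>Body text.</p>'}))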
@@ -277,33 +317,6 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
         raise

-def partition_doc_unstructured(tenant, document_version, unstructured_request):
-    current_app.logger.debug(f'Partitioning document version {document_version.id} for tenant {tenant.id}')
-    # Initiate the connection to unstructured.io
-    url = current_app.config.get('UNSTRUCTURED_FULL_URL')
-    api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
-    unstructured_client = UnstructuredClient(server_url=url, api_key_auth=api_key)
-
-    try:
-        res = unstructured_client.general.partition(unstructured_request)
-        chunks = []
-        for el in res.elements:
-            match el['type']:
-                case 'CompositeElement':
-                    chunks.append(el['text'])
-                case 'Image':
-                    pass
-                case 'Table':
-                    chunks.append(el['metadata']['text_as_html'])
-        current_app.logger.debug(f'Finished partioning document version {document_version.id} for tenant {tenant.id}')
-        return chunks
-    except SDKError as e:
-        current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
-                                 f'on document version {document_version.id} while chuncking'
-                                 f'error: {e}')
-        raise

 def embed_chunks(tenant, model_variables, document_version, chunks):
     current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
                              f'on document version {document_version.id}')
@@ -334,7 +347,7 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
 def parse_html(html_content, tags, included_elements=None, excluded_elements=None):
     soup = BeautifulSoup(html_content, 'html.parser')
-    extracted_content = []
+    extracted_html = ''

     if included_elements:
         elements_to_parse = soup.find_all(included_elements)
@@ -353,82 +366,28 @@ def parse_html(html_content, tags, included_elements=None, excluded_elements=Non
             if excluded_elements and sub_element.find_parent(excluded_elements):
                 continue  # Skip this sub_element if it's within any of the excluded_elements
             sub_content = html.unescape(sub_element.get_text(strip=False))
-            extracted_content.append((sub_element.name, sub_content))
+            extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'

     title = soup.find('title').get_text(strip=True)
-    return extracted_content, title
+    return extracted_html, title

-def create_potential_chunks(extracted_data, end_tags):
-    potential_chunks = []
-    current_chunk = []
-
-    for tag, text in extracted_data:
-        formatted_text = f"- {text}" if tag == 'li' else f"{text}\n"
-        if current_chunk and tag in end_tags and current_chunk[-1][0] in end_tags:
-            # Consecutive li and p elements stay together
-            current_chunk.append((tag, formatted_text))
-        else:
-            # End the current chunk if the last element was an end tag
-            if current_chunk and current_chunk[-1][0] in end_tags:
-                potential_chunks.append(current_chunk)
-                current_chunk = []
-            current_chunk.append((tag, formatted_text))
-
-    # Add the last chunk
-    if current_chunk:
-        potential_chunks.append(current_chunk)
-
-    return potential_chunks
-
-def combine_chunks(potential_chunks, min_chars, max_chars):
-    actual_chunks = []
-    current_chunk = ""
-    current_length = 0
-
-    for chunk in potential_chunks:
-        current_app.embed_tuning_logger.debug(f'chunk: {chunk}')
-        chunk_content = ''.join(text for _, text in chunk)
-        current_app.embed_tuning_logger.debug(f'chunk_content: {chunk_content}')
-        chunk_length = len(chunk_content)
-
-        if current_length + chunk_length > max_chars:
-            if current_length >= min_chars:
-                current_app.embed_tuning_logger.debug(f'Adding chunk to actual_chunks: {current_chunk}')
-                actual_chunks.append(current_chunk)
-                current_chunk = chunk_content
-                current_length = chunk_length
-            else:
-                # If the combined chunk is still less than max_chars, keep adding
-                current_chunk += chunk_content
-                current_length += chunk_length
-        else:
-            current_chunk += chunk_content
-            current_length += chunk_length
-            current_app.embed_tuning_logger.debug(f'Remaining Chunk: {current_chunk}')
-            current_app.embed_tuning_logger.debug(f'Remaining Length: {current_length}')
-
-    # Handle the last chunk
-    if current_chunk and current_length >= 0:
-        actual_chunks.append(current_chunk)
-
-    return actual_chunks
-
 def process_youtube(tenant, model_variables, document_version):
     base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location)
-    # clean old files if necessary
+    download_file_name = f'{document_version.id}.mp4'
+    compressed_file_name = f'{document_version.id}.mp3'
+    transcription_file_name = f'{document_version.id}.txt'
+    markdown_file_name = f'{document_version.id}.md'

-    of, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant)
+    of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
     document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
-    compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant)
-    transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables)
-    annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables)
+    compress_audio(base_path, download_file_name, compressed_file_name, tenant)
+    transcribe_audio(base_path, compressed_file_name, transcription_file_name, document_version.language, tenant, model_variables)
+    annotate_transcription(base_path, transcription_file_name, markdown_file_name, tenant, model_variables)

-    potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant)
+    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
     actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                                 model_variables['max_chunk_size'])
     enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
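parse_html now returns a simplified-HTML string (stripped text re-wrapped in its own tag, one element per line) instead of a list of (tag, text) tuples, since the downstream chunkers work on the markdown generated from it. A self-contained sketch of the new extraction shape, assuming beautifulsoup4 is installed:

from bs4 import BeautifulSoup

def parse_html_sketch(html_content, tags):
    soup = BeautifulSoup(html_content, 'html.parser')
    extracted_html = ''
    for element in soup.find_all(tags):
        # Re-wrap the stripped text in its own tag, one element per line
        extracted_html += f'<{element.name}>{element.get_text(strip=True)}</{element.name}>\n'
    title_tag = soup.find('title')
    title = title_tag.get_text(strip=True) if title_tag else ''
    return extracted_html, title

content = '<title>Doc</title><h1>Heading</h1><p>Paragraph.</p>'
print(parse_html_sketch(content, ['h1', 'p'])[0])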

View File

@@ -64,3 +64,25 @@
 2024-07-04 15:49:43,253 [DEBUG] eveai_app: CELERY_RESULT_BACKEND: redis://redis:6379/0
 2024-07-04 15:49:43,271 [INFO] eveai_app: EveAI App Server Started Successfully
 2024-07-04 15:49:43,271 [INFO] eveai_app: -------------------------------------------------------------------------------------------------
+2024-07-08 12:15:29,532 [INFO] eveai_app: eveai_app starting up
+2024-07-08 12:15:29,562 [INFO] eveai_app: Project ID: eveai-420711
+2024-07-08 12:15:29,562 [INFO] eveai_app: Location: europe-west1
+2024-07-08 12:15:29,562 [INFO] eveai_app: Key Ring: eveai-chat
+2024-07-08 12:15:29,562 [INFO] eveai_app: Crypto Key: envelope-encryption-key
+2024-07-08 12:15:29,563 [INFO] eveai_app: Key Name: projects/eveai-420711/locations/europe-west1/keyRings/eveai-chat/cryptoKeys/envelope-encryption-key
+2024-07-08 12:15:29,563 [INFO] eveai_app: Service Account Key Path: None
+2024-07-08 12:15:29,573 [DEBUG] eveai_app: CELERY_BROKER_URL: redis://redis:6379/0
+2024-07-08 12:15:29,573 [DEBUG] eveai_app: CELERY_RESULT_BACKEND: redis://redis:6379/0
+2024-07-08 12:15:29,611 [INFO] eveai_app: EveAI App Server Started Successfully
+2024-07-08 12:15:29,611 [INFO] eveai_app: -------------------------------------------------------------------------------------------------
+2024-07-08 12:16:20,375 [INFO] eveai_app: eveai_app starting up
+2024-07-08 12:16:20,398 [INFO] eveai_app: Project ID: eveai-420711
+2024-07-08 12:16:20,398 [INFO] eveai_app: Location: europe-west1
+2024-07-08 12:16:20,398 [INFO] eveai_app: Key Ring: eveai-chat
+2024-07-08 12:16:20,398 [INFO] eveai_app: Crypto Key: envelope-encryption-key
+2024-07-08 12:16:20,398 [INFO] eveai_app: Key Name: projects/eveai-420711/locations/europe-west1/keyRings/eveai-chat/cryptoKeys/envelope-encryption-key
+2024-07-08 12:16:20,398 [INFO] eveai_app: Service Account Key Path: None
+2024-07-08 12:16:20,402 [DEBUG] eveai_app: CELERY_BROKER_URL: redis://redis:6379/0
+2024-07-08 12:16:20,402 [DEBUG] eveai_app: CELERY_RESULT_BACKEND: redis://redis:6379/0
+2024-07-08 12:16:20,421 [INFO] eveai_app: EveAI App Server Started Successfully
+2024-07-08 12:16:20,421 [INFO] eveai_app: -------------------------------------------------------------------------------------------------

View File

@@ -169,4 +169,5 @@ zope.event==5.0
 zope.interface==6.3
 zxcvbn==4.4.28
 pytube~=15.0.0
+PyPDF2~=3.0.1