From ea0127b4b849f03b0f78115fdbea6f9639c539e0 Mon Sep 17 00:00:00 2001
From: Josako
Date: Mon, 8 Jul 2024 15:20:45 +0200
Subject: [PATCH] Improve algorithms for HTML and PDF processing

---
 common/utils/model_utils.py |  23 ++--
 config/config.py            |  78 ++++++------
 docker/compose.yaml         |  25 ++--
 eveai_workers/tasks.py      | 219 +++++++++++++++---------------------
 logs/eveai_app.log          |  22 ++++
 requirements.txt            |   3 +-
 6 files changed, 176 insertions(+), 194 deletions(-)

diff --git a/common/utils/model_utils.py b/common/utils/model_utils.py
index fcccc98..df3ce7e 100644
--- a/common/utils/model_utils.py
+++ b/common/utils/model_utils.py
@@ -8,7 +8,7 @@ import ast
 from typing import List
 from openai import OpenAI
 
-from common.models.document import EmbeddingSmallOpenAI
+from common.models.document import EmbeddingSmallOpenAI, EmbeddingLargeOpenAI
 
 
 class CitedAnswer(BaseModel):
@@ -83,6 +83,10 @@ def select_model_variables(tenant):
     model_variables['html_included_elements'] = tenant.html_included_elements
     model_variables['html_excluded_elements'] = tenant.html_excluded_elements
 
+    # Set Chunk Size variables
+    model_variables['min_chunk_size'] = tenant.min_chunk_size
+    model_variables['max_chunk_size'] = tenant.max_chunk_size
+
     # Set Embedding variables
     match embedding_provider:
         case 'openai':
@@ -92,8 +96,11 @@ def select_model_variables(tenant):
                     model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
                                                                           model='text-embedding-3-small')
                     model_variables['embedding_db_model'] = EmbeddingSmallOpenAI
-                    model_variables['min_chunk_size'] = current_app.config.get('OAI_TE3S_MIN_CHUNK_SIZE')
-                    model_variables['max_chunk_size'] = current_app.config.get('OAI_TE3S_MAX_CHUNK_SIZE')
+                case 'text-embedding-3-large':
+                    api_key = current_app.config.get('OPENAI_API_KEY')
+                    model_variables['embedding_model'] = OpenAIEmbeddings(api_key=api_key,
+                                                                          model='text-embedding-3-large')
+                    model_variables['embedding_db_model'] = EmbeddingLargeOpenAI
                 case _:
                     raise Exception(f'Error setting model variables for tenant {tenant.id} '
                                     f'error: Invalid embedding model')
@@ -119,13 +126,9 @@ def select_model_variables(tenant):
                     history_template = current_app.config.get('GPT4_HISTORY_TEMPLATE')
                     encyclopedia_template = current_app.config.get('GPT4_ENCYCLOPEDIA_TEMPLATE')
                     transcript_template = current_app.config.get('GPT4_TRANSCRIPT_TEMPLATE')
+                    html_parse_template = current_app.config.get('GPT4_HTML_PARSE_TEMPLATE')
+                    pdf_parse_template = current_app.config.get('GPT4_PDF_PARSE_TEMPLATE')
                     tool_calling_supported = True
-                case 'gpt-3-5-turbo':
-                    summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
-                    rag_template = current_app.config.get('GPT3_5_RAG_TEMPLATE')
-                    history_template = current_app.config.get('GPT3_5_HISTORY_TEMPLATE')
-                    encyclopedia_template = current_app.config.get('GPT3_5_ENCYCLOPEDIA_TEMPLATE')
-                    transcript_template = current_app.config.get('GPT3_5_TRANSCRIPT_TEMPLATE')
                 case _:
                     raise Exception(f'Error setting model variables for tenant {tenant.id} '
                                     f'error: Invalid chat model')
@@ -134,6 +137,8 @@ def select_model_variables(tenant):
             model_variables['history_template'] = history_template
             model_variables['encyclopedia_template'] = encyclopedia_template
             model_variables['transcript_template'] = transcript_template
+            model_variables['html_parse_template'] = html_parse_template
+            model_variables['pdf_parse_template'] = pdf_parse_template
             if tool_calling_supported:
                 model_variables['cited_answer_cls'] = CitedAnswer
         case _:
diff --git a/config/config.py b/config/config.py
index 39c0801..b13dd04 100644
--- a/config/config.py
+++ b/config/config.py
@@ -59,7 +59,7 @@ class Config(object):
 
     # supported LLMs
     SUPPORTED_EMBEDDINGS = ['openai.text-embedding-3-small', 'openai.text-embedding-3-large', 'mistral.mistral-embed']
-    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo', 'openai.gpt-3.5-turbo', 'mistral.mistral-large-2402']
+    SUPPORTED_LLMS = ['openai.gpt-4o', 'openai.gpt-4-turbo']
 
     # Celery settings
     CELERY_TASK_SERIALIZER = 'json'
@@ -69,16 +69,45 @@ class Config(object):
     CELERY_ENABLE_UTC = True
 
     # Chunk Definition, Embedding dependent
-    OAI_TE3S_MIN_CHUNK_SIZE = 2000
-    OAI_TE3S_MAX_CHUNK_SIZE = 3000
-    OAI_TE3L_MIN_CHUNK_SIZE = 3000
-    OAI_TE3L_MAX_CHUNK_SIZE = 4000
+    # OAI_TE3S_MIN_CHUNK_SIZE = 2000
+    # OAI_TE3S_MAX_CHUNK_SIZE = 3000
+    # OAI_TE3L_MIN_CHUNK_SIZE = 3000
+    # OAI_TE3L_MAX_CHUNK_SIZE = 4000
 
     # LLM TEMPLATES
+    GPT4_HTML_PARSE_TEMPLATE = """You are a top administrative assistant specialized in transforming given HTML into markdown-formatted files. The generated files will be used to generate embeddings in a RAG-system.
+
+    # Best practices are:
+    - Respect the wordings and language(s) used in the HTML.
+    - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
+    - When a header is followed by a series of sub-headers without content (paragraphs or listed items), present those sub-headers as a list.
+    - Be careful with the encoding of the text. Everything needs to be human-readable.
+
+    Process the file carefully, taking a stepwise approach. The resulting markdown should be the result of processing the complete input HTML file. Answer with the pure markdown, without any other text.
+
+    The HTML is delimited between triple backquotes.
+
+    ```{html}```"""
+
+    GPT4_PDF_PARSE_TEMPLATE = """You are a top administrative aide specialized in transforming given PDF files into markdown-formatted files. The generated files will be used to generate embeddings in a RAG-system.
+
+    # Best practices are:
+    - Respect the wordings and language(s) used in the PDF.
+    - The following items need to be considered: headings, paragraphs, listed items (numbered or not) and tables. Images can be neglected.
+    - When headings are numbered, show the numbering and define the header level.
+    - A new item is started when a newline is found before a full line is reached. To determine the number of characters in a full line, check the document and the local context (e.g. an image could temporarily limit the number of characters per line).
+    - Paragraphs are to be stripped of newlines so they become easily readable.
+    - Be careful with the encoding of the text. Everything needs to be human-readable.
+
+    Process the file carefully, taking a stepwise approach. The resulting markdown should be the result of processing the complete input PDF content. Answer with the pure markdown, without any other text.
+
+    The PDF content is delimited between triple backquotes.
+
+    ```{pdf_content}```
+    """
+
     GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
     ```{text}```"""
-    GPT3_5_SUMMARY_TEMPLATE = """Write a concise summary of the text in {language}. The text is delimited between triple backquotes.
-    ```{text}```"""
 
     GPT4_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
     {tenant_context}
@@ -88,14 +117,6 @@ class Config(object):
     ```{context}```
     Question:
     {question}"""
-    GPT3_5_RAG_TEMPLATE = """Answer the question based on the following context, delimited between triple backquotes.
-    {tenant_context}
-    Use the following {language} in your communication.
-    If the question cannot be answered using the given context, say "I have insufficient information to answer this question."
-    Context:
-    ```{context}```
-    Question:
-    {question}"""
 
     GPT4_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
     in such a way that the question is understandable without the previous context.
@@ -108,29 +129,12 @@ class Config(object):
     {tenant_context}
     The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
     The history is delimited between triple backquotes.
     You answer by stating the question in {language}.
     History:
     ```{history}```
     Question to be detailed:
     {question}"""
-    GPT3_5_HISTORY_TEMPLATE = """You are a helpful assistant that details a question based on a previous context,
-    in such a way that the question is understandable without the previous context.
-    {tenant_context}
-    The context is a conversation history, with the HUMAN asking questions, the AI answering questions.
-    The history is delimited between triple backquotes.
-    You answer by stating the question in {language}.
-    History:
-    ```{history}```
-    Question to be detailed:
-    {question}"""
-
     GPT4_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
     'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
     If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
     Question:
     {question}"""
-    GPT3_5_ENCYCLOPEDIA_TEMPLATE = """You have a lot of background knowledge, and as such you are some kind of
-    'encyclopedia' to explain general terminology. Only answer if you have a clear understanding of the question.
-    If not, say you do not have sufficient information to answer the question. Use the {language} in your communication.
-    Question:
-    {question}"""
-
     GPT4_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
     and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
     Do the following:
     - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
     - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
     - improve errors in the transcript given the context, but leave the text intact.
 
     ```{transcript}```
     """
-    GPT3_5_TRANSCRIPT_TEMPLATE = """You are a transcription editor that improves a given transcript on several parts
-    and returns markdown. Without changing what people say. The transcript is delimited between triple backquotes.
-    Do the following:
-    - divide the transcript into several logical parts. Ensure questions and their answers are in the same logical part.
-    - annotate the text to identify these logical parts using headings (max 2 levels) in the same language as the transcript.
-    - improve errors in the transcript given the context, but leave the text intact.
-
-    ```{transcript}```
-    """
-
     # SocketIO settings
     # SOCKETIO_ASYNC_MODE = 'threading'
     SOCKETIO_ASYNC_MODE = 'gevent'
diff --git a/docker/compose.yaml b/docker/compose.yaml
index 51a5f8f..e0cfca8 100644
--- a/docker/compose.yaml
+++ b/docker/compose.yaml
@@ -7,6 +7,15 @@
 # You can add other services your application may depend on here, such as a
 # database or a cache. For examples, see the Awesome Compose repository:
 # https://github.com/docker/awesome-compose
+
+x-common-variables: &common-variables
+  DB_HOST: db
+  DB_USER: luke
+  DB_PASS: Skywalker!
+  DB_NAME: eveai
+  FLASK_ENV: development
+  FLASK_DEBUG: 1
+
 services:
   nginx:
     image: nginx:latest
@@ -30,12 +39,7 @@ services:
     ports:
       - 5001:5001
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
-      - DB_HOST=db
-      - DB_USER=luke
-      - DB_PASS=Skywalker!
-      - DB_NAME=eveai
+      <<: *common-variables
     volumes:
       - ../eveai_app:/app/eveai_app
       - ../common:/app/common
@@ -63,8 +67,7 @@ services:
 #    ports:
 #      - 5001:5001
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
+      <<: *common-variables
     volumes:
       - ../eveai_workers:/app/eveai_workers
       - ../common:/app/common
@@ -91,8 +94,7 @@ services:
     ports:
       - 5002:5002
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
+      <<: *common-variables
    volumes:
       - ../eveai_chat:/app/eveai_chat
       - ../common:/app/common
@@ -118,8 +120,7 @@ services:
 #    ports:
 #      - 5001:5001
     environment:
-      - FLASK_ENV=development
-      - FLASK_DEBUG=1
+      <<: *common-variables
     volumes:
       - ../eveai_chat_workers:/app/eveai_chat_workers
       - ../common:/app/common
diff --git a/eveai_workers/tasks.py b/eveai_workers/tasks.py
index d3c90d7..3c3793a 100644
--- a/eveai_workers/tasks.py
+++ b/eveai_workers/tasks.py
@@ -14,11 +14,8 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from sqlalchemy.exc import SQLAlchemyError
-# Unstructured commercial client imports
-from unstructured_client import UnstructuredClient
-from unstructured_client.models import shared
-from unstructured_client.models.errors import SDKError
 from pytube import YouTube
+import PyPDF2
 
 from common.extensions import db
 from common.models.document import DocumentVersion, Embedding
@@ -105,22 +102,19 @@ def create_embeddings(tenant_id, document_version_id):
 
 
 def process_pdf(tenant, model_variables, document_version):
+    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
+                             document_version.file_location)
     file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location,
                              document_version.file_name)
     if os.path.exists(file_path):
-        with open(file_path, 'rb') as f:
-            files = shared.Files(content=f.read(), file_name=document_version.file_name)
-            req = shared.PartitionParameters(
-                files=files,
-                strategy='hi_res',
-                hi_res_model_name='yolox',
-                coordinates=True,
-                extract_image_block_types=['Image', 'Table'],
-                chunking_strategy='by_title',
-                combine_under_n_chars=model_variables['min_chunk_size'],
-                max_characters=model_variables['max_chunk_size'],
-            )
+        pdf_text = ''
+        # Extract the text of every page of the PDF into a single string
+        with open(file_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+            for page in reader.pages:
+                # extract_text() can come back empty for image-only pages
+                pdf_text += page.extract_text() or ''
     else:
         current_app.logger.error(f'The physical file for document version {document_version.id} '
                                  f'for tenant {tenant.id} '
                                  f'does not exist')
         create_embeddings.update_state(state=states.FAILURE)
         raise
 
-    try:
-        chunks = partition_doc_unstructured(tenant, document_version, req)
-    except Exception as e:
-        current_app.logger.error(f'Unable to create Embeddings for tenant {tenant.id} '
-                                 f'while processing PDF on document version {document_version.id} '
-                                 f'error: {e}')
-        create_embeddings.update_state(state=states.FAILURE)
-        raise
+    markdown = generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_text)
+    markdown_file_name = f'{document_version.id}.md'
+    output_file = os.path.join(base_path,
+                               markdown_file_name)
+    with open(output_file, 'w') as f:
+        f.write(markdown)
+
+    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+                                         model_variables['max_chunk_size'])
+
+    if len(chunks) > 1:
+        summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
+        document_version.system_context = f'Summary: {summary}\n'
+    else:
+        document_version.system_context = ''
 
-    summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
-    document_version.system_context = f'Summary: {summary}\n'
     enriched_chunks = enrich_chunks(tenant, document_version, chunks)
 
     embeddings = embed_chunks(tenant, model_variables, document_version, enriched_chunks)
@@ -150,10 +149,8 @@ def process_pdf(tenant, model_variables, document_version):
         db.session.commit()
     except SQLAlchemyError as e:
         current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
-                                 f'on PDF, document version {document_version.id}'
+                                 f'on PDF, document version {document_version.id} '
                                  f'error: {e}')
-        db.session.rollback()
-        create_embeddings.update_state(state=states.FAILURE)
         raise
 
     current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
@@ -179,6 +176,9 @@ def process_html(tenant, model_variables, document_version):
     html_included_elements = model_variables['html_included_elements']
     html_excluded_elements = model_variables['html_excluded_elements']
 
+    base_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
+                             document_version.file_location)
+
     file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location,
                              document_version.file_name)
@@ -193,16 +193,22 @@ def process_html(tenant, model_variables, document_version):
         create_embeddings.update_state(state=states.FAILURE)
         raise
 
-    extracted_data, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
+    extracted_html, title = parse_html(html_content, html_tags, included_elements=html_included_elements,
                                        excluded_elements=html_excluded_elements)
-    potential_chunks = create_potential_chunks(extracted_data, html_end_tags)
-    current_app.embed_tuning_logger.debug(f'Nr of potential chunks: {len(potential_chunks)}')
+    extracted_file_name = f'{document_version.id}-extracted.html'
+    output_file = os.path.join(base_path, extracted_file_name)
+    with open(output_file, 'w') as f:
+        f.write(extracted_html)
 
-    chunks = combine_chunks(potential_chunks,
-                            model_variables['min_chunk_size'],
-                            model_variables['max_chunk_size']
-                            )
-    current_app.logger.debug(f'Nr of chunks: {len(chunks)}')
+    markdown = generate_markdown_from_html(tenant, model_variables, document_version, extracted_html)
+    markdown_file_name = f'{document_version.id}.md'
+    output_file = os.path.join(base_path, markdown_file_name)
+    with open(output_file, 'w') as f:
+        f.write(markdown)
+
+    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
+    chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
+                                         model_variables['max_chunk_size'])
 
     if len(chunks) > 1:
         summary = summarize_chunk(tenant, model_variables, document_version, chunks[0])
@@ -253,6 +259,40 @@ def enrich_chunks(tenant, document_version, chunks):
 
     return enriched_chunks
 
 
+def generate_markdown_from_html(tenant, model_variables, document_version, html_content):
+    current_app.logger.debug(f'Generating Markdown from HTML for tenant {tenant.id} '
+                             f'on document version {document_version.id}')
+    llm = model_variables['llm']
+    template = model_variables['html_parse_template']
+    parse_prompt = ChatPromptTemplate.from_template(template)
+    setup = RunnablePassthrough()
+    output_parser = StrOutputParser()
+
+    chain = setup | parse_prompt | llm | output_parser
+    input_html = {"html": html_content}
+
+    markdown = chain.invoke(input_html)
+
+    return markdown
+
+
+def generate_markdown_from_pdf(tenant, model_variables, document_version, pdf_content):
+    current_app.logger.debug(f'Generating Markdown from PDF for tenant {tenant.id} '
+                             f'on document version {document_version.id}')
+    llm = model_variables['llm']
+    template = model_variables['pdf_parse_template']
+    parse_prompt = ChatPromptTemplate.from_template(template)
+    setup = RunnablePassthrough()
+    output_parser = StrOutputParser()
+
+    chain = setup | parse_prompt | llm | output_parser
+    input_pdf = {"pdf_content": pdf_content}
+
+    markdown = chain.invoke(input_pdf)
+
+    return markdown
+
+
 def summarize_chunk(tenant, model_variables, document_version, chunk):
     current_app.logger.debug(f'Summarizing chunk for tenant {tenant.id} '
                              f'on document version {document_version.id}')
@@ -277,33 +317,6 @@ def summarize_chunk(tenant, model_variables, document_version, chunk):
     raise
 
 
-def partition_doc_unstructured(tenant, document_version, unstructured_request):
-    current_app.logger.debug(f'Partitioning document version {document_version.id} for tenant {tenant.id}')
-    # Initiate the connection to unstructured.io
-    url = current_app.config.get('UNSTRUCTURED_FULL_URL')
-    api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
-    unstructured_client = UnstructuredClient(server_url=url, api_key_auth=api_key)
-
-    try:
-        res = unstructured_client.general.partition(unstructured_request)
-        chunks = []
-        for el in res.elements:
-            match el['type']:
-                case 'CompositeElement':
-                    chunks.append(el['text'])
-                case 'Image':
-                    pass
-                case 'Table':
-                    chunks.append(el['metadata']['text_as_html'])
-        current_app.logger.debug(f'Finished partioning document version {document_version.id} for tenant {tenant.id}')
-        return chunks
-    except SDKError as e:
-        current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
-                                 f'on document version {document_version.id} while chuncking'
-                                 f'error: {e}')
-        raise
-
-
 def embed_chunks(tenant, model_variables, document_version, chunks):
     current_app.logger.debug(f'Embedding chunks for tenant {tenant.id} '
                              f'on document version {document_version.id}')
@@ -334,7 +347,7 @@ def embed_chunks(tenant, model_variables, document_version, chunks):
 
 def parse_html(html_content, tags, included_elements=None, excluded_elements=None):
     soup = BeautifulSoup(html_content, 'html.parser')
-    extracted_content = []
+    extracted_html = ''
 
     if included_elements:
         elements_to_parse = soup.find_all(included_elements)
@@ -353,82 +366,28 @@ def parse_html(html_content, tags, included_elements=None, excluded_elements=Non
             if excluded_elements and sub_element.find_parent(excluded_elements):
                 continue  # Skip this sub_element if it's within any of the excluded_elements
             sub_content = html.unescape(sub_element.get_text(strip=False))
-            extracted_content.append((sub_element.name, sub_content))
+            extracted_html += f'<{sub_element.name}>{sub_element.get_text(strip=True)}</{sub_element.name}>\n'
 
     title = soup.find('title').get_text(strip=True)
 
-    return extracted_content, title
-
-
-def create_potential_chunks(extracted_data, end_tags):
-    potential_chunks = []
-    current_chunk = []
-
-    for tag, text in extracted_data:
-        formatted_text = f"- {text}" if tag == 'li' else f"{text}\n"
-        if current_chunk and tag in end_tags and current_chunk[-1][0] in end_tags:
-            # Consecutive li and p elements stay together
-            current_chunk.append((tag, formatted_text))
-        else:
-            # End the current chunk if the last element was an end tag
-            if current_chunk and current_chunk[-1][0] in end_tags:
-                potential_chunks.append(current_chunk)
-                current_chunk = []
-            current_chunk.append((tag, formatted_text))
-
-    # Add the last chunk
-    if current_chunk:
-        potential_chunks.append(current_chunk)
-    return potential_chunks
-
-
-def combine_chunks(potential_chunks, min_chars, max_chars):
-    actual_chunks = []
-    current_chunk = ""
-    current_length = 0
-
-    for chunk in potential_chunks:
-        current_app.embed_tuning_logger.debug(f'chunk: {chunk}')
-        chunk_content = ''.join(text for _, text in chunk)
-        current_app.embed_tuning_logger.debug(f'chunk_content: {chunk_content}')
-        chunk_length = len(chunk_content)
-
-        if current_length + chunk_length > max_chars:
-            if current_length >= min_chars:
-                current_app.embed_tuning_logger.debug(f'Adding chunk to actual_chunks: {current_chunk}')
-                actual_chunks.append(current_chunk)
-                current_chunk = chunk_content
-                current_length = chunk_length
-            else:
-                # If the combined chunk is still less than max_chars, keep adding
-                current_chunk += chunk_content
-                current_length += chunk_length
-        else:
-            current_chunk += chunk_content
-            current_length += chunk_length
-
-        current_app.embed_tuning_logger.debug(f'Remaining Chunk: {current_chunk}')
-        current_app.embed_tuning_logger.debug(f'Remaining Length: {current_length}')
-
-    # Handle the last chunk
-    if current_chunk and current_length >= 0:
-        actual_chunks.append(current_chunk)
-
-    return actual_chunks
+    return extracted_html, title
 
 
 def process_youtube(tenant, model_variables, document_version):
     base_path = os.path.join(current_app.config['UPLOAD_FOLDER'], document_version.file_location)
-    # clean old files if necessary
+    download_file_name = f'{document_version.id}.mp4'
+    compressed_file_name = f'{document_version.id}.mp3'
+    transcription_file_name = f'{document_version.id}.txt'
+    markdown_file_name = f'{document_version.id}.md'
 
-    of, title, description, author = download_youtube(document_version.url, base_path, 'downloaded.mp4', tenant)
+    of, title, description, author = download_youtube(document_version.url, base_path, download_file_name, tenant)
     document_version.system_context = f'Title: {title}\nDescription: {description}\nAuthor: {author}'
-    compress_audio(base_path, 'downloaded.mp4', 'compressed.mp3', tenant)
-    transcribe_audio(base_path, 'compressed.mp3', 'transcription.txt', document_version.language, tenant, model_variables)
-    annotate_transcription(base_path, 'transcription.txt', 'transcription.md', tenant, model_variables)
+    compress_audio(base_path, download_file_name, compressed_file_name, tenant)
+    transcribe_audio(base_path, compressed_file_name, transcription_file_name, document_version.language, tenant, model_variables)
+    annotate_transcription(base_path, transcription_file_name, markdown_file_name, tenant, model_variables)
 
-    potential_chunks = create_potential_chunks_for_markdown(base_path, 'transcription.md', tenant)
+    potential_chunks = create_potential_chunks_for_markdown(base_path, markdown_file_name, tenant)
     actual_chunks = combine_chunks_for_markdown(potential_chunks, model_variables['min_chunk_size'],
                                                 model_variables['max_chunk_size'])
     enriched_chunks = enrich_chunks(tenant, document_version, actual_chunks)
diff --git a/logs/eveai_app.log b/logs/eveai_app.log
index b789027..474547f 100644
--- a/logs/eveai_app.log
+++ b/logs/eveai_app.log
@@ -64,3 +64,25 @@
 2024-07-04 15:49:43,253 [DEBUG] eveai_app: CELERY_RESULT_BACKEND: redis://redis:6379/0
 2024-07-04 15:49:43,271 [INFO] eveai_app: EveAI App Server Started Successfully
 2024-07-04 15:49:43,271 [INFO] eveai_app: -------------------------------------------------------------------------------------------------
+2024-07-08 12:15:29,532 [INFO] eveai_app: eveai_app starting up
+2024-07-08 12:15:29,562 [INFO] eveai_app: Project ID: eveai-420711
+2024-07-08 12:15:29,562 [INFO] eveai_app: Location: europe-west1
+2024-07-08 12:15:29,562 [INFO] eveai_app: Key Ring: eveai-chat
+2024-07-08 12:15:29,562 [INFO] eveai_app: Crypto Key: envelope-encryption-key
+2024-07-08 12:15:29,563 [INFO] eveai_app: Key Name: projects/eveai-420711/locations/europe-west1/keyRings/eveai-chat/cryptoKeys/envelope-encryption-key
+2024-07-08 12:15:29,563 [INFO] eveai_app: Service Account Key Path: None
+2024-07-08 12:15:29,573 [DEBUG] eveai_app: CELERY_BROKER_URL: redis://redis:6379/0
+2024-07-08 12:15:29,573 [DEBUG] eveai_app: CELERY_RESULT_BACKEND: redis://redis:6379/0
+2024-07-08 12:15:29,611 [INFO] eveai_app: EveAI App Server Started Successfully
+2024-07-08 12:15:29,611 [INFO] eveai_app: -------------------------------------------------------------------------------------------------
+2024-07-08 12:16:20,375 [INFO] eveai_app: eveai_app starting up
+2024-07-08 12:16:20,398 [INFO] eveai_app: Project ID: eveai-420711
+2024-07-08 12:16:20,398 [INFO] eveai_app: Location: europe-west1
+2024-07-08 12:16:20,398 [INFO] eveai_app: Key Ring: eveai-chat
+2024-07-08 12:16:20,398 [INFO] eveai_app: Crypto Key: envelope-encryption-key
+2024-07-08 12:16:20,398 [INFO] eveai_app: Key Name: projects/eveai-420711/locations/europe-west1/keyRings/eveai-chat/cryptoKeys/envelope-encryption-key
+2024-07-08 12:16:20,398 [INFO] eveai_app: Service Account Key Path: None
+2024-07-08 12:16:20,402 [DEBUG] eveai_app: CELERY_BROKER_URL: redis://redis:6379/0
+2024-07-08 12:16:20,402 [DEBUG] eveai_app: CELERY_RESULT_BACKEND: redis://redis:6379/0
+2024-07-08 12:16:20,421 [INFO] eveai_app: EveAI App Server Started Successfully
+2024-07-08 12:16:20,421 [INFO] eveai_app: -------------------------------------------------------------------------------------------------
diff --git a/requirements.txt b/requirements.txt
index 37aaea3..cebd6a7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -169,4 +169,5 @@ zope.event==5.0
 zope.interface==6.3
 zxcvbn==4.4.28
 
-pytube~=15.0.0
\ No newline at end of file
+pytube~=15.0.0
+PyPDF2~=3.0.1
\ No newline at end of file
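
A note on the compose.yaml refactor: YAML merge keys (`<<:`) only merge mappings, which is why each service's `environment` block had to be converted from the list form (`- KEY=value`) to the mapping form (`KEY: value`) before `<<: *common-variables` could be used. A minimal sketch of the pattern; the `app` service and `busybox` image are placeholders for illustration, not part of this patch:

```yaml
# Shared environment defined once at the top level of compose.yaml.
x-common-variables: &common-variables
  DB_HOST: db
  FLASK_ENV: development

services:
  app:
    image: busybox  # placeholder image
    environment:
      # Merge the shared mapping, then add service-specific keys alongside it.
      <<: *common-variables
      FLASK_DEBUG: 1
```

Mixing `<<:` into the list form is invalid YAML, so the two styles cannot be combined within a single `environment` block.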
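On the switch from unstructured_client to PyPDF2 in tasks.py: PyPDF2 3.0.1 (the version pinned in requirements.txt) is the final release under that name, and the same `PdfReader` API is maintained going forward in the `pypdf` package. A standalone sketch of the extraction step as used in `process_pdf`; the file name is hypothetical:

```python
import PyPDF2  # pypdf provides the same PdfReader API going forward


def pdf_to_text(path: str) -> str:
    """Concatenate the extractable text of every page of a PDF."""
    with open(path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # extract_text() may yield an empty string for image-only pages,
        # so guard the concatenation.
        return ''.join(page.extract_text() or '' for page in reader.pages)


print(pdf_to_text('example.pdf')[:200])
```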
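Both the PDF and HTML paths now funnel the LLM-generated markdown through `create_potential_chunks_for_markdown` and `combine_chunks_for_markdown`, which this patch does not define. A hedged sketch of what such helpers typically do — split at heading boundaries, then greedily merge until each chunk lands between the tenant's `min_chunk_size` and `max_chunk_size`. Names and behavior below are assumptions for illustration only, not the repo's actual implementation:

```python
def split_markdown_on_headings(markdown: str) -> list[str]:
    """Split markdown into potential chunks at heading boundaries (assumed behavior)."""
    chunks, current = [], []
    for line in markdown.splitlines(keepends=True):
        # A line starting with '#' opens a new potential chunk.
        if line.lstrip().startswith('#') and current:
            chunks.append(''.join(current))
            current = []
        current.append(line)
    if current:
        chunks.append(''.join(current))
    return chunks


def combine_chunks(potential: list[str], min_chars: int, max_chars: int) -> list[str]:
    """Greedily merge potential chunks toward the [min_chars, max_chars] window."""
    combined, current = [], ''
    for chunk in potential:
        # Close the current chunk once adding more would overshoot max_chars,
        # but only if it already meets the minimum size.
        if current and len(current) + len(chunk) > max_chars and len(current) >= min_chars:
            combined.append(current)
            current = ''
        current += chunk
    if current:
        combined.append(current)
    return combined
```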