Implement processing of HTML and improve both HTML & PDF processing using new tenant information.

This commit is contained in:
Josako
2024-05-13 17:18:38 +02:00
parent adee283d7a
commit 6c2e99f467
4 changed files with 253 additions and 111 deletions

View File

@@ -59,6 +59,9 @@ class Config(object):
     GPT4_SUMMARY_TEMPLATE = """Write a concise summary of the text in the same language as the provided text.
 Text is delimited between triple backquotes.
 ```{text}```"""
+    GPT3_5_SUMMARY_TEMPLATE = """Write a concise summary of the text in the same language as the provided text.
+Text is delimited between triple backquotes.
+```{text}```"""
 
 
 class DevConfig(Config):
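
For illustration, a minimal sketch of how these per-model templates can be resolved at runtime. The Config class and the two *_SUMMARY_TEMPLATE attributes mirror this commit (template text shortened); pick_summary_template() is a hypothetical helper, matching the match-statement in summarize_chunk further down.

    # Hypothetical helper: map an LLM model name to its summary template.
    class Config(object):
        GPT4_SUMMARY_TEMPLATE = "Write a concise summary:\n```{text}```"      # shortened
        GPT3_5_SUMMARY_TEMPLATE = "Write a concise summary:\n```{text}```"    # shortened

    def pick_summary_template(config, llm_model):
        templates = {
            'gpt-4-turbo': config.GPT4_SUMMARY_TEMPLATE,
            'gpt-3.5-turbo': config.GPT3_5_SUMMARY_TEMPLATE,
        }
        return templates[llm_model]

    print(pick_summary_template(Config, 'gpt-3.5-turbo').format(text='document body'))
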

View File

@@ -13,7 +13,6 @@ class AddDocumentForm(FlaskForm):
     language = SelectField('Language', choices=[], validators=[Optional()])
     user_context = TextAreaField('User Context', validators=[Optional()])
     valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])
-    doc_embedding_model = SelectField('Default Embedding Model', choices=[], validators=[DataRequired()])
     submit = SubmitField('Submit')
@@ -23,8 +22,6 @@ class AddDocumentForm(FlaskForm):
                                  session.get('tenant').get('allowed_languages')]
         self.language.data = session.get('default_language')
-        self.doc_embedding_model.data = session.get('embedding_model')
 
 
 class AddURLForm(FlaskForm):
     url = URLField('URL', validators=[DataRequired(), URL()])
@@ -32,7 +29,6 @@ class AddURLForm(FlaskForm):
     language = SelectField('Language', choices=[], validators=[Optional()])
     user_context = TextAreaField('User Context', validators=[Optional()])
     valid_from = DateField('Valid from', id='form-control datepicker', validators=[Optional()])
-    doc_embedding_model = SelectField('Embedding Model', choices=[], validators=[DataRequired()])
     submit = SubmitField('Submit')
@@ -41,4 +37,3 @@ class AddURLForm(FlaskForm):
         self.language.choices = [(language, language) for language in
                                  session.get('tenant').get('allowed_languages')]
         self.language.data = session.get('default_language')
-        self.doc_embedding_model.data = session.get('default_embedding_model')
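
With the doc_embedding_model field gone from both forms, the embedding model is no longer user-selectable; the worker reads it from the tenant instead (see embed_chunks below). What survives is the pattern of seeding choices and defaults from per-tenant session data. A self-contained sketch, with Flask's session replaced by a plain dict so it runs standalone:

    # Sketch of the surviving form pattern: per-tenant language choices.
    # session_data stands in for Flask's session; its shape is assumed
    # from the diff above (session['tenant']['allowed_languages']).
    from flask import Flask
    from flask_wtf import FlaskForm
    from wtforms import SelectField, SubmitField
    from wtforms.validators import Optional

    class AddDocumentForm(FlaskForm):
        language = SelectField('Language', choices=[], validators=[Optional()])
        submit = SubmitField('Submit')

        def __init__(self, session_data, *args, **kwargs):
            super().__init__(*args, **kwargs)
            allowed = session_data['tenant']['allowed_languages']
            self.language.choices = [(lang, lang) for lang in allowed]
            self.language.data = session_data.get('default_language')

    app = Flask(__name__)
    app.config['WTF_CSRF_ENABLED'] = False
    with app.test_request_context():
        form = AddDocumentForm({'tenant': {'allowed_languages': ['en', 'nl']},
                                'default_language': 'en'})
        print(form.language.choices)   # [('en', 'en'), ('nl', 'nl')]
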

View File

@@ -38,7 +38,15 @@ def add_document():
         filename = secure_filename(file.filename)
         extension = filename.rsplit('.', 1)[1].lower()
-        create_document_stack(form, file, filename, extension)
+        new_doc, new_doc_lang, new_doc_vers = create_document_stack(form, file, filename, extension)
+        task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
+            session['tenant']['id'],
+            new_doc_vers.id,
+        ])
+        current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
+                                f'Document Version {new_doc_vers.id}. '
+                                f'Embedding creation task: {task.id}')
         return redirect(url_for('document_bp.documents'))
@@ -67,7 +75,16 @@ def add_url():
             filename += '.html'
             extension = 'html'
-            create_document_stack(form, file, filename, extension)
+            new_doc, new_doc_lang, new_doc_vers = create_document_stack(form, file, filename, extension)
+            task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
+                session['tenant']['id'],
+                new_doc_vers.id,
+            ])
+            current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
+                                    f'Document Version {new_doc_vers.id}. '
+                                    f'Embedding creation task: {task.id}')
             return redirect(url_for('document_bp.documents'))
     return render_template('document/add_url.html', form=form)
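
Both routes now queue the same two-argument task and only log the returned task id; everything model-related is resolved worker-side from the tenant. A sketch of that producer/consumer contract (the broker URL and Celery app instance are stand-ins for current_celery; task and queue names match the diff):

    # Producer/consumer contract for the slimmed-down task signature.
    # Assumes a reachable broker; 'redis://localhost:6379/0' is illustrative.
    from celery import Celery

    celery_app = Celery('worker', broker='redis://localhost:6379/0')

    @celery_app.task(name='create_embeddings', queue='embeddings')
    def create_embeddings(tenant_id, document_version_id):
        """Resolve tenant settings, then chunk and embed the document."""
        ...

    # Route side (mirrors add_document/add_url above):
    task = celery_app.send_task('create_embeddings', queue='embeddings',
                                args=[1, 42])  # tenant id, document version id

The routes never need to know which embedding or LLM model is in play; that knowledge now lives with the tenant record.
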
@@ -123,9 +140,7 @@ def create_document_stack(form, file, filename, extension):
         db.session.add(new_doc)
         db.session.add(new_doc_lang)
         db.session.add(new_doc_vers)
-        log_session_state(db.session, "Before first commit")
         db.session.commit()
-        log_session_state(db.session, "After first commit")
     except SQLAlchemyError as e:
         current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}')
         flash('Error adding document.', 'error')
@@ -140,9 +155,7 @@ def create_document_stack(form, file, filename, extension):
         new_doc_lang = db.session.merge(new_doc_lang)
         new_doc_vers = db.session.merge(new_doc_vers)
         new_doc_lang.latest_version_id = new_doc_vers.id
-        log_session_state(db.session, "Before second commit")
         db.session.commit()
-        log_session_state(db.session, "After second commit")
     except SQLAlchemyError as e:
         current_app.logger.error(f'Error adding document for tenant {session["tenant"]["id"]}: {e}')
         flash('Error adding document.', 'error')
@@ -160,15 +173,8 @@ def create_document_stack(form, file, filename, extension):
                             f'Document Version {new_doc.id}')
     upload_file_for_version(new_doc_vers, file, extension)
 
-    task = current_celery.send_task('create_embeddings', queue='embeddings', args=[
-        session['tenant']['id'],
-        new_doc_vers.id,
-        session['default_embedding_model'],
-    ])
-    current_app.logger.info(f'Embedding creation started for tenant {session["tenant"]["id"]}, '
-                            f'Document Version {new_doc_vers.id}. '
-                            f'Embedding creation task: {task.id}')
+    return new_doc, new_doc_lang, new_doc_vers
 
 
 def log_session_state(session, msg=""):

View File

@@ -1,7 +1,8 @@
 from datetime import datetime as dt, timezone as tz
 from flask import current_app
 from sqlalchemy.exc import SQLAlchemyError
+from celery import states
+from celery.exceptions import Ignore
 import os
 
 # Unstructured commercial client imports
@@ -18,6 +19,7 @@ from langchain_core.exceptions import LangChainException
 from common.utils.database import Database
 from common.models.document import DocumentVersion, EmbeddingMistral, EmbeddingSmallOpenAI
+from common.models.user import Tenant
 from common.extensions import db
 from common.utils.celery_utils import current_celery
@@ -25,14 +27,21 @@ from bs4 import BeautifulSoup
 @current_celery.task(name='create_embeddings', queue='embeddings')
-def create_embeddings(tenant_id, document_version_id, default_embedding_model):
+def create_embeddings(tenant_id, document_version_id):
     # Setup Remote Debugging only if PYCHARM_DEBUG=True
     if current_app.config['PYCHARM_DEBUG']:
         import pydevd_pycharm
         pydevd_pycharm.settrace('localhost', port=50170, stdoutToServer=True, stderrToServer=True)
 
-    current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id} '
-                            f'with model {default_embedding_model}')
+    current_app.logger.info(f'Creating embeddings for tenant {tenant_id} on document version {document_version_id}.')
+
+    # Retrieve Tenant for which we are processing
+    tenant = Tenant.query.get(tenant_id)
+    if tenant is None:
+        current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. '
+                                 f'Tenant not found')
+        create_embeddings.update_state(state=states.FAILURE)
+        raise Ignore()
 
     # Ensure we are working in the correct database schema
     Database(tenant_id).switch_schema()
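
Database(tenant_id).switch_schema() pins the worker's database session to the tenant's schema before any ORM access. The Database class itself is not part of this diff, so the following is only a hedged sketch of the usual PostgreSQL search_path approach such a helper takes (the schema naming scheme and the explicit session argument are assumptions; the real helper presumably reaches the active session itself):

    # Hypothetical sketch of schema-per-tenant switching.
    from sqlalchemy import text
    from sqlalchemy.orm import Session

    class Database:
        def __init__(self, tenant_id):
            self.schema = f'tenant_{tenant_id}'   # naming scheme assumed

        def switch_schema(self, session: Session):
            # Scope subsequent queries in this session to the tenant schema.
            session.execute(text(f'SET search_path TO "{self.schema}", public'))
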
@@ -42,7 +51,9 @@ def create_embeddings(tenant_id, document_version_id, default_embedding_model):
     if document_version is None:
         current_app.logger.error(f'Cannot create embeddings for tenant {tenant_id}. '
                                  f'Document version {document_version_id} not found')
-        return
+        create_embeddings.update_state(state=states.FAILURE)
+        raise Ignore()
     db.session.add(document_version)
 
     # start processing
@@ -52,29 +63,22 @@ def create_embeddings(tenant_id, document_version_id, default_embedding_model):
         db.session.commit()
     except SQLAlchemyError as e:
         current_app.logger.error(f'Error saving document version {document_version_id} to database '
-                                 f'for tenant {tenant_id} when creating embeddings. '
+                                 f'for tenant {tenant_id} when starting creating of embeddings. '
                                  f'error: {e}')
-        return
+        create_embeddings.update_state(state=states.FAILURE)
+        raise Ignore()
 
-    embed_provider = default_embedding_model.rsplit('.', 1)[0]
-    embed_model = default_embedding_model.rsplit('.', 1)[1]
-
-    # define embedding variables
-    embedding_function = None
-    match (embed_provider, embed_model):
-        case ('openai', 'text-embedding-3-small'):
-            embedding_function = embed_chunks_for_text_embedding_3_small
-        case ('mistral', 'mistral.mistral-embed'):
-            embedding_function = embed_chunks_for_mistral_embed
-
     match document_version.file_type:
         case 'pdf':
-            process_pdf(tenant_id, document_version, embedding_function, default_embedding_model)
+            process_pdf(tenant, document_version)
         case 'html':
-            process_html(tenant_id, document_version, embedding_function, default_embedding_model)
+            process_html(tenant, document_version)
         case _:
             current_app.logger.info(f'No functionality defined for file type {document_version.file_type} '
                                     f'for tenant {tenant_id} '
                                     f'while creating embeddings for document version {document_version_id}')
+            create_embeddings.update_state(state=states.FAILURE)
+            raise Ignore()
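
The update_state(state=states.FAILURE) followed by raise Ignore() pairs introduced above are the stock Celery idiom for failing a task deliberately: Ignore() halts execution without Celery overwriting the state during its own exception handling. A standalone illustration (this commit calls update_state on the task object directly, which resolves to the current request the same way bind=True does):

    # The failure idiom used throughout create_embeddings, in isolation.
    from celery import Celery, states
    from celery.exceptions import Ignore

    celery_app = Celery('worker', broker='redis://localhost:6379/0')

    @celery_app.task(bind=True)
    def guarded_task(self, payload):
        if payload is None:
            # Mark FAILURE ourselves, then tell Celery to leave it alone.
            self.update_state(state=states.FAILURE)
            raise Ignore()
        return len(payload)
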
 
 
 @current_celery.task(name='ask_eve_ai', queue='llm_interactions')
@@ -83,7 +87,7 @@ def ask_eve_ai(query):
     pass
 
 
-def process_pdf(tenant_id, document_version, embedding_function, embedding_model):
+def process_pdf(tenant, document_version):
     file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location,
                              document_version.file_name)
@@ -97,102 +101,162 @@ def process_pdf(tenant_id, document_version, embedding_function, embedding_model
             coordinates=True,
             extract_image_block_types=['Image', 'Table'],
             chunking_strategy='by_title',
-            combine_under_n_chars=2000,
-            max_characters=3000,
+            combine_under_n_chars=current_app.config.get('MIN_CHUNK_SIZE'),
+            max_characters=current_app.config.get('MAX_CHUNK_SIZE'),
         )
-        try:
-            chunks = partition_doc_unstructured(tenant_id, document_version, req)
-            enriched_chunk_docs = enrich_chunks(tenant_id, document_version, chunks)
-            embeddings = embedding_function(tenant_id, document_version, enriched_chunk_docs)
-        except Exception as e:
-            current_app.logger.error(f'Unable to create Embeddings for tenant {tenant_id} '
-                                     f'on document version {document_version.id} '
-                                     f'with model {embedding_model} '
-                                     f'error: {e}')
-            raise
-
-        # Save embeddings & processing information to the database
-        db.session.add_all(embeddings)
-        db.session.add(document_version)
-        document_version.processing_finished_at = dt.now(tz.utc)
-        document_version.processing = False
-        try:
-            db.session.commit()
-        except SQLAlchemyError as e:
-            current_app.logger.error(f'Error saving embedding information for tenant {tenant_id} '
-                                     f'on document version {document_version.id}'
-                                     f'error: {e}')
-            db.session.rollback()
-            raise
-        current_app.logger.info(f'Embeddings created successfully for tenant {tenant_id} '
-                                f'on document version {document_version.id} :-)')
-    else:  # file exists
-        current_app.logger.error(f'The physical file for document version {document_version.id} '
-                                 f'at {file_path} does not exist')
-        raise
+    else:
+        current_app.logger.error(f'The physical file for document version {document_version.id} '
+                                 f'for tenant {tenant.id} '
+                                 f'at {file_path} does not exist')
+        create_embeddings.update_state(state=states.FAILURE)
+        raise Ignore()
+
+    try:
+        chunks = partition_doc_unstructured(tenant, document_version, req)
+    except Exception as e:
+        current_app.logger.error(f'Unable to create Embeddings for tenant {tenant.id} '
+                                 f'while processing PDF on document version {document_version.id} '
+                                 f'error: {e}')
+        create_embeddings.update_state(state=states.FAILURE)
+        raise Ignore()
+
+    summary = summarize_chunk(tenant, document_version, chunks[0])
+    doc_lang = document_version.document_language
+    doc_lang.system_context = f'Summary: {summary}\n'
+    enriched_chunks = enrich_chunks(tenant, document_version, chunks)
+    embeddings = embed_chunks(tenant, document_version, enriched_chunks)
+
+    try:
+        db.session.add(doc_lang)
+        db.session.add(document_version)
+        document_version.processing_finished_at = dt.now(tz.utc)
+        document_version.processing = False
+        db.session.add_all(embeddings)
+        db.session.commit()
+    except SQLAlchemyError as e:
+        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
+                                 f'on PDF, document version {document_version.id}'
+                                 f'error: {e}')
+        db.session.rollback()
+        raise
+    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
+                            f'on document version {document_version.id} :-)')
-def process_html(tenant_id, document_version, embedding_function, default_embedding_model):
+def process_html(tenant, document_version):
+    # The tags to be considered can be dependent on the tenant
+    html_tags = tenant.html_tags
+    end_tags = tenant.html_end_tags
+    included_elements = tenant.html_included_elements
+    excluded_elements = tenant.html_excluded_elements
+
     file_path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                              document_version.file_location,
                              document_version.file_name)
     if os.path.exists(file_path):
         with open(file_path, 'rb') as f:
             html_content = f.read()
+    else:
+        current_app.logger.error(f'The physical file for document version {document_version.id} '
+                                 f'for tenant {tenant.id} '
+                                 f'at {file_path} does not exist')
+        create_embeddings.update_state(state=states.FAILURE)
+        raise Ignore()
+
+    extracted_data, title = parse_html(html_content, html_tags, included_elements=included_elements,
+                                       excluded_elements=excluded_elements)
-def enrich_chunks(tenant_id, document_version, chunks):
-    # We're adding filename and a summary of the first chunk to all the chunks to create global context
-    # using openAI to summarise
-    api_key = current_app.config.get('OPENAI_API_KEY')
-    # TODO: model selection to be adapted to model approach
-    llm = ChatOpenAI(api_key=api_key, temperature=0, model='gpt-4-turbo')
-    summary_template = current_app.config.get('GPT4_SUMMARY_TEMPLATE')
-    prompt = ChatPromptTemplate.from_template(summary_template)
-    chain = load_summarize_chain(llm, chain_type='stuff', prompt=prompt)
-    doc_creator = CharacterTextSplitter(chunk_size=9000, chunk_overlap=0)
-    text_to_summarize = doc_creator.create_documents(chunks[0])
-    try:
-        summary = chain.run(text_to_summarize)
-        doc_lang = document_version.document_language
-        db.session.add(doc_lang)
-        doc_lang.system_context = f'Summary:\n {summary}'
-        try:
-            db.session.commit()
-        except SQLAlchemyError as e:
-            current_app.logger. error(f'Error saving summary to DocumentLanguage {doc_lang.id} '
-                                      f'while enriching chunks for tenant {tenant_id} '
-                                      f'on document version {document_version.id} '
-                                      f'error: {e}')
-            raise
-        chunk_global_context = (f'Filename: {doc_lang.document.name}\n'
-                                f'User Context:\n{doc_lang.user_context}'
-                                f'System Context:\n{summary}')
+    potential_chunks = create_potential_chunks(extracted_data, end_tags)
+    chunks = combine_chunks(potential_chunks,
+                            current_app.config.get('MIN_CHUNK_SIZE'),
+                            current_app.config.get('MAX_CHUNK_SIZE')
+                            )
+    summary = summarize_chunk(tenant, document_version, chunks[0])
+
+    doc_lang = document_version.document_language
+    doc_lang.system_context = (f'Title: {title}\n'
+                               f'Summary: {summary}\n')
+    enriched_chunks = enrich_chunks(tenant, document_version, chunks)
+    embeddings = embed_chunks(tenant, document_version, enriched_chunks)
+
+    try:
+        db.session.add(doc_lang)
+        db.session.add(document_version)
+        document_version.processing_finished_at = dt.now(tz.utc)
+        document_version.processing = False
+        db.session.add_all(embeddings)
+        db.session.commit()
+    except SQLAlchemyError as e:
+        current_app.logger.error(f'Error saving embedding information for tenant {tenant.id} '
+                                 f'on HTML, document version {document_version.id}'
+                                 f'error: {e}')
+        db.session.rollback()
+        raise
+    current_app.logger.info(f'Embeddings created successfully for tenant {tenant.id} '
+                            f'on document version {document_version.id} :-)')
+
+
+def enrich_chunks(tenant, document_version, chunks):
+    doc_lang = document_version.document_language
+    chunk_total_context = (f'Filename: {document_version.file_name}\n'
+                           f'{doc_lang.system_context}\n'
+                           f'User Context:\n{doc_lang.user_context}')
     enriched_chunks = []
     initial_chunk = f'Filename: {document_version.file_name}\n User Context:\n{doc_lang.user_context}\n{chunks[0]}'
     enriched_chunks.append(initial_chunk)
     for chunk in chunks[1:]:
-        enriched_chunk = f'{chunk_global_context}\n{chunk}'
+        enriched_chunk = f'{chunk_total_context}\n{chunk}'
         enriched_chunks.append(enriched_chunk)
     return enriched_chunks
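
For reference, a worked example of the enrichment shape, calling enrich_chunks as defined above with stand-in model objects (SimpleNamespace replaces DocumentVersion/DocumentLanguage; values are illustrative):

    # First chunk keeps a minimal header; every later chunk is prefixed
    # with the full filename + system context + user context preamble.
    from types import SimpleNamespace

    doc_lang = SimpleNamespace(system_context='Summary: Q1 figures.\n',
                               user_context='Finance upload')
    doc_vers = SimpleNamespace(file_name='report.pdf', document_language=doc_lang)
    print(enrich_chunks(None, doc_vers, ['intro text', 'section text'])[1])
    # Filename: report.pdf
    # Summary: Q1 figures.
    #
    # User Context:
    # Finance upload
    # section text
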
+
+
+def summarize_chunk(tenant, document_version, chunk):
+    llm_model = tenant.llm_model
+    llm_provider = llm_model.split('.', 1)[0]
+    llm_model = llm_model.split('.', 1)[1]
+
+    summary_template = ''
+    llm = None
+    match llm_provider:
+        case 'openai':
+            api_key = current_app.config.get('OPENAI_API_KEY')
+            llm = ChatOpenAI(api_key=api_key, temperature=0, model=llm_model)
+            match llm_model:
+                case 'gpt-4-turbo':
+                    summary_template = current_app.config.get('GPT4_SUMMARY_TEMPLATE')
+                case 'gpt-3.5-turbo':
+                    summary_template = current_app.config.get('GPT3_5_SUMMARY_TEMPLATE')
+                case _:
+                    current_app.logger.error(f'Error summarizing initial chunk for tenant {tenant.id} '
+                                             f'on document version {document_version.id} '
+                                             f'error: Invalid llm model')
+                    create_embeddings.update_state(state=states.FAILURE)
+                    raise Ignore()
+        case _:
+            current_app.logger.error(f'Error summarizing initial chunk for tenant {tenant.id} '
+                                     f'on document version {document_version.id} '
+                                     f'error: Invalid llm provider')
+
+    prompt = ChatPromptTemplate.from_template(summary_template)
+    chain = load_summarize_chain(llm, chain_type='stuff', prompt=prompt)
+    doc_creator = CharacterTextSplitter(chunk_size=current_app.config.get('MAX_CHUNK_SIZE') * 2, chunk_overlap=0)
+    text_to_summarize = doc_creator.create_documents(chunk)
+    try:
+        summary = chain.run(text_to_summarize)
     except LangChainException as e:
-        current_app.logger.error(f'Error creating summary for chunk enrichment for tenant {tenant_id} '
+        current_app.logger.error(f'Error creating summary for chunk enrichment for tenant {tenant.id} '
                                  f'on document version {document_version.id} '
                                  f'error: {e}')
         raise
+    return summary
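
tenant.llm_model is stored as 'provider.model' and split on the first dot, which keeps dotted model names intact; a tiny check (the setting value is illustrative):

    # split('.', 1) cuts at the FIRST dot, so 'gpt-3.5-turbo' survives.
    value = 'openai.gpt-3.5-turbo'          # illustrative tenant.llm_model
    provider, model = value.split('.', 1)
    assert (provider, model) == ('openai', 'gpt-3.5-turbo')

embed_chunks below uses rsplit('.', 1) instead, which splits at the last dot; with the single current value 'openai.text-embedding-3-small' the two behave identically.
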
-def partition_doc_unstructured(tenant_id, document_version, unstructured_request):
+def partition_doc_unstructured(tenant, document_version, unstructured_request):
     # Initiate the connection to unstructured.io
     url = current_app.config.get('UNSTRUCTURED_FULL_URL')
     api_key = current_app.config.get('UNSTRUCTURED_API_KEY')
@@ -211,20 +275,41 @@ def partition_doc_unstructured(tenant_id, document_version, unstructured_request
                 chunks.append(el['metadata']['text_as_html'])
         return chunks
     except SDKError as e:
-        current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
+        current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
                                  f'on document version {document_version.id} while chuncking'
                                  f'error: {e}')
         raise
-def embed_chunks_for_text_embedding_3_small(tenant_id, document_version, chunks):
+def embed_chunks(tenant, document_version, chunks):
+    embedding_provider = tenant.embedding_model.rsplit('.', 1)[0]
+    embedding_model = tenant.embedding_model.rsplit('.', 1)[1]
+    match embedding_provider:
+        case 'openai':
+            match embedding_model:
+                case 'text-embedding-3-small':
+                    return embed_chunks_for_text_embedding_3_small(tenant, document_version, chunks)
+                case _:
+                    current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
+                                             f'on document version {document_version.id} '
+                                             f'error: Invalid embedding model')
+                    create_embeddings.update_state(state=states.FAILURE)
+                    raise Ignore()
+        case _:
+            current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
+                                     f'on document version {document_version.id} '
+                                     f'error: Invalid embedding provider')
+
+
+def embed_chunks_for_text_embedding_3_small(tenant, document_version, chunks):
     # Create embedding vectors using OpenAI
     api_key = current_app.config.get('OPENAI_API_KEY')
     embeddings_model = OpenAIEmbeddings(api_key=api_key, model='text-embedding-3-small')
     try:
         embeddings = embeddings_model.embed_documents(chunks)
     except LangChainException as e:
-        current_app.logger.error(f'Error creating embeddings for tenant {tenant_id} '
+        current_app.logger.error(f'Error creating embeddings for tenant {tenant.id} '
                                  f'on document version {document_version.id} while calling OpenAI API'
                                  f'error: {e}')
         raise
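
The provider/model dispatch in embed_chunks is easy to exercise in isolation; a runnable sketch with the network call stubbed out (the vectors are fake and the function takes the setting string directly; only the routing logic mirrors the diff):

    # Routing sketch: provider and model come from the tenant's
    # 'provider.model' setting; the actual embedding call is stubbed.
    def embed_chunks(embedding_setting, chunks):
        provider, model = embedding_setting.rsplit('.', 1)
        match (provider, model):
            case ('openai', 'text-embedding-3-small'):
                return [[0.0, 0.0, 0.0] for _ in chunks]  # stand-in vectors
            case _:
                raise ValueError(f'unsupported embedding setting: {embedding_setting}')

    print(len(embed_chunks('openai.text-embedding-3-small', ['a', 'b'])))  # 2
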
@@ -246,7 +331,7 @@ def embed_chunks_for_mistral_embed(tenant_id, document_version, chunks):
     pass
 
 
-def parse_html(html_content, included_elements=None, excluded_elements=None):
+def parse_html(html_content, tags, included_elements=None, excluded_elements=None):
     soup = BeautifulSoup(html_content, 'html.parser')
     extracted_content = []
@@ -263,4 +348,57 @@ def parse_html(html_content, included_elements=None, excluded_elements=None):
                     continue  # Skip this sub_element if it's within any of the excluded_elements
                 extracted_content.append((sub_element.name, sub_element.get_text(strip=True)))
-    return extracted_content
+
+    title = soup.find('title').get_text(strip=True)
+    return extracted_content, title
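
One caveat on the new title extraction: soup.find('title') returns None for documents without a <title>, so the chained get_text call above raises AttributeError on such input. A defensive variant (a sketch, not the committed code):

    # Defensive variant of the title lookup above.
    from bs4 import BeautifulSoup

    def extract_title(html_content, default=''):
        soup = BeautifulSoup(html_content, 'html.parser')
        tag = soup.find('title')
        return tag.get_text(strip=True) if tag is not None else default

    print(extract_title(b'<title> Tenant docs </title>'))        # 'Tenant docs'
    print(extract_title(b'<p>no head</p>', default='untitled'))  # 'untitled'
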
+
+
+def create_potential_chunks(extracted_data, end_tags):
+    potential_chunks = []
+    current_chunk = []
+
+    for tag, text in extracted_data:
+        formatted_text = f"- {text}" if tag == 'li' else f"{text}\n"
+        if current_chunk and tag in end_tags and current_chunk[-1][0] in end_tags:
+            # Consecutive li and p elements stay together
+            current_chunk.append((tag, formatted_text))
+        else:
+            # End the current chunk if the last element was an end tag
+            if current_chunk and current_chunk[-1][0] in end_tags:
+                potential_chunks.append(current_chunk)
+                current_chunk = []
+            current_chunk.append((tag, formatted_text))
+
+    # Add the last chunk
+    if current_chunk:
+        potential_chunks.append(current_chunk)
+    return potential_chunks
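
A worked example of the grouping rule, run against create_potential_chunks as defined above: headings attach to the text after them, and a chunk boundary falls once a run of end-tag elements ends. Sample data and end_tags are illustrative:

    extracted = [('h1', 'Tenants'), ('p', 'Intro.'), ('li', 'First'),
                 ('li', 'Second'), ('h2', 'Limits'), ('p', 'More.')]
    for chunk in create_potential_chunks(extracted, end_tags=['p', 'li']):
        print(chunk)
    # [('h1', 'Tenants\n'), ('p', 'Intro.\n'), ('li', '- First'), ('li', '- Second')]
    # [('h2', 'Limits\n'), ('p', 'More.\n')]
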
+
+
+def combine_chunks(potential_chunks, min_chars, max_chars):
+    actual_chunks = []
+    current_chunk = ""
+    current_length = 0
+
+    for chunk in potential_chunks:
+        chunk_content = ''.join(text for _, text in chunk)
+        chunk_length = len(chunk_content)
+
+        if current_length + chunk_length > max_chars:
+            if current_length >= min_chars:
+                actual_chunks.append(current_chunk)
+                current_chunk = chunk_content
+                current_length = chunk_length
+            else:
+                # If the combined chunk is still less than max_chars, keep adding
+                current_chunk += chunk_content
+                current_length += chunk_length
+        else:
+            current_chunk += chunk_content
+            current_length += chunk_length
+
+    # Handle the last chunk
+    if current_chunk and current_length >= min_chars:
+        actual_chunks.append(current_chunk)
+    return actual_chunks
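
And the packing step, run against combine_chunks as defined above: potential chunks are concatenated until max_chars would be exceeded, a new chunk starts only once the current one has reached min_chars, and a final fragment shorter than min_chars is dropped. A worked example (sizes are illustrative):

    parts = [[('p', 'a' * 40)], [('p', 'b' * 40)], [('p', 'c' * 10)]]
    print([len(c) for c in combine_chunks(parts, min_chars=30, max_chars=60)])
    # [40, 50] -> 'a'*40 is flushed alone; 'b'*40 and 'c'*10 are packed together

    print(combine_chunks([[('p', 'tiny')]], min_chars=30, max_chars=60))
    # [] -> a lone trailing fragment under min_chars is dropped
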