- Adapt Sync WordPress Component to Catalog introduction

- Small bug fixes
This commit is contained in:
Josako
2024-10-17 10:31:13 +02:00
parent 7f12c8b355
commit 74cc7ae95e
12 changed files with 94 additions and 30 deletions

View File

@@ -15,6 +15,7 @@ class HTMLProcessor(Processor):
self.html_end_tags = model_variables['html_end_tags']
self.html_included_elements = model_variables['html_included_elements']
self.html_excluded_elements = model_variables['html_excluded_elements']
self.html_excluded_classes = model_variables['html_excluded_classes']
self.chunk_size = model_variables['processing_chunk_size'] # Adjust this based on your LLM's optimal input size
self.chunk_overlap = model_variables[
'processing_chunk_overlap'] # Adjust for context preservation between chunks
@@ -45,7 +46,7 @@ class HTMLProcessor(Processor):
self._log(f'Parsing HTML for tenant {self.tenant.id}')
soup = BeautifulSoup(html_content, 'html.parser')
extracted_html = ''
excluded_classes = self._parse_excluded_classes(self.tenant.html_excluded_classes)
excluded_classes = self._parse_excluded_classes(self.html_excluded_classes)
if self.html_included_elements:
elements_to_parse = soup.find_all(self.html_included_elements)