- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
# Add verification logging
self._log(f"HTML Processor initialized with tuning={self.tuning}")
if self.tuning:
self._log_tuning("HTML Processor initialized", {
self.log_tuning("HTML Processor initialized", {
"html_tags": self.html_tags,
"html_end_tags": self.html_end_tags,
"included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
return extracted_html, title
def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
input_html = {"html": chunk}
markdown_chunk = chain.invoke(input_html)
markdown_chunks.append(markdown_chunk)
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
markdown = "\n\n".join(markdown_chunks)
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')