- Addition of general chunking parameters chunking_heading_level and chunking patterns
- Addition of Processor types docx and markdown
This commit is contained in:
@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
|
||||
# Add verification logging
|
||||
self._log(f"HTML Processor initialized with tuning={self.tuning}")
|
||||
if self.tuning:
|
||||
self._log_tuning("HTML Processor initialized", {
|
||||
self.log_tuning("HTML Processor initialized", {
|
||||
"html_tags": self.html_tags,
|
||||
"html_end_tags": self.html_end_tags,
|
||||
"included_elements": self.html_included_elements,
|
||||
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
|
||||
title = soup.find('title').get_text(strip=True) if soup.find('title') else ''
|
||||
|
||||
self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
|
||||
self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
|
||||
self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
|
||||
return extracted_html, title
|
||||
|
||||
def _generate_markdown_from_html(self, html_content):
|
||||
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
|
||||
input_html = {"html": chunk}
|
||||
markdown_chunk = chain.invoke(input_html)
|
||||
markdown_chunks.append(markdown_chunk)
|
||||
self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
|
||||
self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
|
||||
|
||||
markdown = "\n\n".join(markdown_chunks)
|
||||
self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')
|
||||
|
||||
Reference in New Issue
Block a user