- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions
--- a/eveai_workers/processors/html_processor.py
+++ b/eveai_workers/processors/html_processor.py
@@ -24,7 +24,7 @@ class HTMLProcessor(BaseProcessor):
        # Add verification logging
        self._log(f"HTML Processor initialized with tuning={self.tuning}")
        if self.tuning:
-            self._log_tuning("HTML Processor initialized", {
+            self.log_tuning("HTML Processor initialized", {
                "html_tags": self.html_tags,
                "html_end_tags": self.html_end_tags,
                "included_elements": self.html_included_elements,
@@ -75,7 +75,7 @@ class HTMLProcessor(BaseProcessor):
        title = soup.find('title').get_text(strip=True) if soup.find('title') else ''

        self._log(f'Finished parsing HTML for tenant {self.tenant.id}')
-        self._log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
+        self.log_tuning("_parse_html", {"extracted_html": extracted_html, "title": title})
        return extracted_html, title

    def _generate_markdown_from_html(self, html_content):
@@ -96,7 +96,7 @@ class HTMLProcessor(BaseProcessor):
            input_html = {"html": chunk}
            markdown_chunk = chain.invoke(input_html)
            markdown_chunks.append(markdown_chunk)
-            self._log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})
+            self.log_tuning("_generate_markdown_from_html", {"chunk": chunk, "markdown_chunk": markdown_chunk})

        markdown = "\n\n".join(markdown_chunks)
        self._log(f'Finished generating markdown from HTML for tenant {self.tenant.id}')