- Addition of general chunking parameters: chunking_heading_level and chunking patterns

- Addition of Processor types: docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -57,7 +57,7 @@ class PDFProcessor(BaseProcessor):
'figures': self._extract_figures(page, page_num, figure_counter),
'tables': self._extract_tables(page)
}
self._log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
self.log_tuning("_extract_content", {"page_num": page_num, "page_content": page_content})
figure_counter += len(page_content['figures'])
extracted_content.append(page_content)
@@ -119,7 +119,7 @@ class PDFProcessor(BaseProcessor):
markdown_table = self._table_to_markdown(table)
if markdown_table: # Only add non-empty tables
tables.append(markdown_table)
self._log_tuning("_extract_tables", {"markdown_table": markdown_table})
self.log_tuning("_extract_tables", {"markdown_table": markdown_table})
except Exception as e:
self._log(f"Error extracting tables from page: {str(e)}", level='error')
return tables