- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -45,7 +45,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
return text_splitter.split_text(transcription)
def _process_chunks(self, chunks):
self._log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
self.log_tuning("_process_chunks", {"Nr of Chunks": len(chunks)})
llm = self.model_variables.get_llm()
template = self.model_variables.get_template('transcript')
language_template = create_language_template(template, self.document_version.language)
@@ -64,7 +64,7 @@ class TranscriptionBaseProcessor(BaseProcessor):
}
markdown = chain.invoke(input_transcript)
markdown = self._clean_markdown(markdown)
self._log_tuning("_process_chunks", {
self.log_tuning("_process_chunks", {
"Chunk Number": f"{i + 1} of {len(chunks)}",
"Chunk": chunk,
"Previous Chunk": previous_part,