- Addition of general chunking parameters chunking_heading_level and chunking_patterns

- Addition of Processor types docx and markdown
This commit is contained in:
Josako
2024-12-05 15:19:37 +01:00
parent 311927d5ea
commit d35ec9f5ae
17 changed files with 718 additions and 66 deletions

View File

@@ -46,7 +46,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
try:
audio_info = AudioSegment.from_file(temp_file_path, format=self.document_version.file_type)
total_duration = len(audio_info)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Audio Duration (ms)": total_duration,
})
segment_length = self.max_compression_duration * 1000 # Convert to milliseconds
@@ -55,7 +55,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_segments = AudioSegment.empty()
for i in range(total_chunks):
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Segment Nr": f"{i + 1} of {total_chunks}"
})
@@ -87,7 +87,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
compressed_filename,
compressed_buffer.read()
)
self._log_tuning("_compress_audio", {
self.log_tuning("_compress_audio", {
"Compressed audio to MinIO": compressed_filename
})
@@ -172,14 +172,14 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcriptions.append(trans)
self._log_tuning("_transcribe_audio", {
self.log_tuning("_transcribe_audio", {
"Chunk Nr": f"{i + 1} of {total_chunks}",
"Segment Duration": segment_duration,
"Transcription": trans,
})
else:
self._log("Warning: Received empty transcription", level='warning')
self._log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
self.log_tuning("_transcribe_audio", {"ERROR": "No transcription"})
except Exception as e:
self._log(f"Error during transcription: {str(e)}", level='error')
@@ -202,7 +202,7 @@ class AudioProcessor(TranscriptionBaseProcessor):
transcription_filename,
full_transcription.encode('utf-8')
)
self._log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
self.log_tuning(f"Saved transcription to MinIO: {transcription_filename}")
return full_transcription