- Improvements on audio processing to limit CPU and memory usage

- Removed Portkey from the equation, and defined explicit monitoring using Langchain native code
- Optimization of Business Event logging
This commit is contained in:
Josako
2024-10-02 14:11:46 +02:00
parent 883175b8f5
commit b700cfac64
13 changed files with 450 additions and 228 deletions

View File

@@ -0,0 +1,49 @@
import time
from langchain.callbacks.base import BaseCallbackHandler
from typing import Dict, Any, List
from langchain.schema import LLMResult
from common.utils.business_event_context import current_event
from flask import current_app
class LLMMetricsHandler(BaseCallbackHandler):
def __init__(self):
self.total_tokens: int = 0
self.prompt_tokens: int = 0
self.completion_tokens: int = 0
self.start_time: float = 0
self.end_time: float = 0
self.total_time: float = 0
def reset(self):
self.total_tokens = 0
self.prompt_tokens = 0
self.completion_tokens = 0
self.start_time = 0
self.end_time = 0
self.total_time = 0
def on_llm_start(self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any) -> None:
self.start_time = time.time()
def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
self.end_time = time.time()
self.total_time = self.end_time - self.start_time
usage = response.llm_output.get('token_usage', {})
self.prompt_tokens += usage.get('prompt_tokens', 0)
self.completion_tokens += usage.get('completion_tokens', 0)
self.total_tokens = self.prompt_tokens + self.completion_tokens
metrics = self.get_metrics()
current_event.log_llm_metrics(metrics)
self.reset() # Reset for the next call
def get_metrics(self) -> Dict[str, int | float]:
return {
'total_tokens': self.total_tokens,
'prompt_tokens': self.prompt_tokens,
'completion_tokens': self.completion_tokens,
'time_elapsed': self.total_time,
'interaction_type': 'LLM',
}

View File

@@ -0,0 +1,51 @@
from langchain_openai import OpenAIEmbeddings
from typing import List, Any
import time
from common.utils.business_event_context import current_event
class TrackedOpenAIEmbeddings(OpenAIEmbeddings):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def embed_documents(self, texts: list[str]) -> list[list[float]]:
start_time = time.time()
result = super().embed_documents(texts)
end_time = time.time()
# Estimate token usage (OpenAI uses tiktoken for this)
import tiktoken
enc = tiktoken.encoding_for_model(self.model)
total_tokens = sum(len(enc.encode(text)) for text in texts)
metrics = {
'total_tokens': total_tokens,
'prompt_tokens': total_tokens, # For embeddings, all tokens are prompt tokens
'completion_tokens': 0,
'time_elapsed': end_time - start_time,
'interaction_type': 'Embedding',
}
current_event.log_llm_metrics(metrics)
return result
def embed_query(self, text: str) -> List[float]:
start_time = time.time()
result = super().embed_query(text)
end_time = time.time()
# Estimate token usage
import tiktoken
enc = tiktoken.encoding_for_model(self.model)
total_tokens = len(enc.encode(text))
metrics = {
'total_tokens': total_tokens,
'prompt_tokens': total_tokens,
'completion_tokens': 0,
'time_elapsed': end_time - start_time,
'interaction_type': 'Embedding',
}
current_event.log_llm_metrics(metrics)
return result

View File

@@ -0,0 +1,27 @@
import time
from common.utils.business_event_context import current_event
def tracked_transcribe(client, *args, **kwargs):
start_time = time.time()
# Extract the file and model from kwargs if present, otherwise use defaults
file = kwargs.get('file')
model = kwargs.get('model', 'whisper-1')
duration = kwargs.pop('duration', 600)
result = client.audio.transcriptions.create(*args, **kwargs)
end_time = time.time()
# Token usage for transcriptions is actually the duration in seconds we pass, as the whisper model is priced per second transcribed
metrics = {
'total_tokens': duration,
'prompt_tokens': 0, # For transcriptions, all tokens are considered "completion"
'completion_tokens': duration,
'time_elapsed': end_time - start_time,
'interaction_type': 'ASR',
}
current_event.log_llm_metrics(metrics)
return result