- Introduction of retrievers - Ensuring processing information is collected from Catalog instead of Tenant - Introduction of a generic Form class to enable dynamic fields based on a configuration - Realisation of Retriever functionality to support dynamic fields
169 lines
7.1 KiB
Python
169 lines
7.1 KiB
Python
from common.extensions import db
|
|
from .user import User, Tenant
|
|
from pgvector.sqlalchemy import Vector
|
|
from sqlalchemy.dialects.postgresql import JSONB
|
|
from sqlalchemy.dialects.postgresql import ARRAY
|
|
import sqlalchemy as sa
|
|
|
|
|
|
class Catalog(db.Model):
    """Catalog record holding per-catalog processing settings.

    Besides its identity (name, description, type and an optional parent
    catalog), a catalog stores HTML selection lists, chunk-size bounds, chat
    temperatures, a tuning flag, free-form JSONB metadata/configuration and
    audit columns.

    The list-valued column defaults are produced by callables so that every
    new row gets its own list object instead of sharing a single list that
    lives on the column definition.
    """
    id = db.Column(db.Integer, primary_key=True)
    # Self-referencing foreign key; NULL when the catalog has no parent.
    parent_id = db.Column(db.Integer, db.ForeignKey('catalog.id'), nullable=True)
    name = db.Column(db.String(50), nullable=False)
    description = db.Column(db.Text, nullable=True)
    type = db.Column(db.String(50), nullable=False, default="DEFAULT_CATALOG")

    # Embedding variables
    # NOTE(review): presumably consumed by the HTML processing / chunking
    # step - confirm against the processor that reads these columns.
    # Callable defaults: a plain list literal would be one shared object that
    # is handed to every inserted instance, so an in-place mutation on one
    # instance would silently change the default for all later rows.
    html_tags = db.Column(
        ARRAY(sa.String(10)),
        nullable=True,
        default=lambda: ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'],
    )
    html_end_tags = db.Column(
        ARRAY(sa.String(10)),
        nullable=True,
        default=lambda: ['p', 'li'],
    )
    html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
    html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
    html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True)

    min_chunk_size = db.Column(db.Integer, nullable=True, default=2000)
    max_chunk_size = db.Column(db.Integer, nullable=True, default=3000)

    # Chat variables ==> Move to Specialist?
    chat_RAG_temperature = db.Column(db.Float, nullable=True, default=0.3)
    chat_no_RAG_temperature = db.Column(db.Float, nullable=True, default=0.5)

    # Tuning enablers
    embed_tuning = db.Column(db.Boolean, nullable=True, default=False)

    # Meta Data
    user_metadata = db.Column(JSONB, nullable=True)
    system_metadata = db.Column(JSONB, nullable=True)
    configuration = db.Column(JSONB, nullable=True)

    # Versioning Information
    # created_at / updated_at are filled by the database clock on insert;
    # updated_at is additionally refreshed on every UPDATE issued by the ORM.
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
    created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
|
|
|
|
|
|
class Retriever(db.Model):
    """Named, typed retriever definition that can be linked to a catalog.

    Carries free-form JSONB metadata and configuration next to the usual
    audit columns.
    """
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50), nullable=False)
    description = db.Column(db.Text, nullable=True)
    # Optional link to the catalog this retriever works on.
    catalog_id = db.Column(
        db.Integer,
        db.ForeignKey('catalog.id'),
        nullable=True,
    )
    type = db.Column(db.String(50), default="DEFAULT_RAG", nullable=False)

    # Meta Data
    user_metadata = db.Column(JSONB, nullable=True)
    system_metadata = db.Column(JSONB, nullable=True)
    configuration = db.Column(JSONB, nullable=True)

    # Versioning Information
    created_at = db.Column(
        db.DateTime,
        server_default=db.func.now(),
        nullable=False,
    )
    created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
    # Database clock on insert, refreshed again on every ORM-issued UPDATE.
    updated_at = db.Column(
        db.DateTime,
        server_default=db.func.now(),
        onupdate=db.func.now(),
        nullable=False,
    )
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
|
|
|
|
|
|
class Document(db.Model):
    """Logical document that belongs to a catalog and owns its versions.

    The optional ``valid_from`` / ``valid_to`` timestamps are stored as-is;
    this model does not enforce any ordering between them.
    """
    id = db.Column(db.Integer, primary_key=True)
    # tenant_id = db.Column(db.Integer, db.ForeignKey(Tenant.id), nullable=False)
    catalog_id = db.Column(
        db.Integer,
        db.ForeignKey(Catalog.id),
        nullable=True,
    )
    name = db.Column(db.String(100), nullable=False)
    valid_from = db.Column(db.DateTime, nullable=True)
    valid_to = db.Column(db.DateTime, nullable=True)

    # Versioning Information
    created_at = db.Column(
        db.DateTime,
        server_default=db.func.now(),
        nullable=False,
    )
    created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
    # Database clock on insert, refreshed again on every ORM-issued UPDATE.
    updated_at = db.Column(
        db.DateTime,
        server_default=db.func.now(),
        onupdate=db.func.now(),
        nullable=False,
    )
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))

    # Relations
    # Adds a ``document`` attribute on every DocumentVersion via the backref.
    versions = db.relationship(
        'DocumentVersion',
        backref='document',
        lazy=True,
    )

    def __repr__(self):
        """Return a short debug representation with id and name."""
        return f"<Document {self.id}: {self.name}>"
|
|
|
|
|
|
class DocumentVersion(db.Model):
    """One stored version of a Document, in a single language.

    Holds the location of the underlying file (URL and/or bucket/object
    name), descriptive context and metadata, audit columns, and the state of
    the processing run for this version. Embeddings produced from the version
    are reachable through ``embeddings``.
    """
    id = db.Column(db.Integer, primary_key=True)
    doc_id = db.Column(db.Integer, db.ForeignKey(Document.id), nullable=False)
    url = db.Column(db.String(200), nullable=True)
    bucket_name = db.Column(db.String(255), nullable=True)
    object_name = db.Column(db.String(200), nullable=True)
    file_type = db.Column(db.String(20), nullable=True)
    file_size = db.Column(db.Float, nullable=True)
    # Two-character language code (column is limited to 2 characters).
    language = db.Column(db.String(2), nullable=False)
    user_context = db.Column(db.Text, nullable=True)
    system_context = db.Column(db.Text, nullable=True)
    user_metadata = db.Column(JSONB, nullable=True)
    system_metadata = db.Column(JSONB, nullable=True)

    # Versioning Information
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
    created_by = db.Column(db.Integer, db.ForeignKey(User.id))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))

    # Processing Information
    processing = db.Column(db.Boolean, nullable=False, default=False)
    processing_started_at = db.Column(db.DateTime, nullable=True)
    processing_finished_at = db.Column(db.DateTime, nullable=True)
    processing_error = db.Column(db.String(255), nullable=True)

    # Relations
    # Adds a ``document_version`` attribute on every Embedding via the backref.
    embeddings = db.relationship('Embedding', backref='document_version', lazy=True)

    def __repr__(self):
        """Return ``<DocumentVersion <doc_id>.<language>.<id>>``.

        Only this row's own columns are read. The previous implementation
        went through a ``document_language`` attribute that this model does
        not define, so calling ``repr()`` raised ``AttributeError``. Using
        ``doc_id`` instead of ``self.document.id`` also avoids triggering a
        lazy load just to print the object.
        """
        return f"<DocumentVersion {self.doc_id}.{self.language}.{self.id}>"
|
|
|
|
|
|
class Embedding(db.Model):
    """Polymorphic base row for a text chunk of a document version.

    ``type`` is the discriminator column: the mapper uses its value to pick
    the concrete subclass. Rows created directly from this class get the
    identity ``'embedding'``.
    """
    __tablename__ = 'embeddings'

    id = db.Column(db.Integer, primary_key=True)
    type = db.Column(db.String(30), nullable=False)
    doc_vers_id = db.Column(
        db.Integer,
        db.ForeignKey(DocumentVersion.id),
        nullable=False,
    )
    active = db.Column(db.Boolean, default=True, nullable=False)
    # The chunk text this embedding row belongs to.
    chunk = db.Column(db.Text, nullable=False)

    # ``type`` below is the column defined above, not the builtin.
    __mapper_args__ = {
        'polymorphic_identity': 'embedding',
        'polymorphic_on': type,
    }
|
|
|
|
|
|
class EmbeddingMistral(Embedding):
    """Embedding subtype storing a 1024-dimensional vector.

    Lives in its own table whose primary key is also a foreign key to
    ``embeddings.id``.
    """
    __tablename__ = 'embedding_mistral'

    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # 1024 is the MISTRAL Embedding dimension.
    # If another embedding model is chosen, this dimension may need to be changed.
    embedding = db.Column(Vector(1024), nullable=False)

    __mapper_args__ = {'polymorphic_identity': 'embedding_mistral'}
|
|
|
|
|
|
class EmbeddingSmallOpenAI(Embedding):
    """Embedding subtype storing a 1536-dimensional vector.

    Lives in its own table whose primary key is also a foreign key to
    ``embeddings.id``.
    """
    __tablename__ = 'embedding_small_openai'

    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # 1536 is the OpenAI Small Embedding dimension.
    # If another embedding model is chosen, this dimension may need to be changed.
    embedding = db.Column(Vector(1536), nullable=False)

    __mapper_args__ = {'polymorphic_identity': 'embedding_small_openai'}
|
|
|
|
|
|
class EmbeddingLargeOpenAI(Embedding):
    """Embedding subtype storing a 3072-dimensional vector.

    Lives in its own table whose primary key is also a foreign key to
    ``embeddings.id``.
    """
    __tablename__ = 'embedding_large_openai'

    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # 3072 is the OpenAI Large Embedding dimension.
    # If another embedding model is chosen, this dimension may need to be changed.
    embedding = db.Column(Vector(3072), nullable=False)

    __mapper_args__ = {'polymorphic_identity': 'embedding_large_openai'}
|