- Tuning moved to the Retriever itself instead of its configuration, as this attribute should be available for all types of Retrievers
170 lines
7.1 KiB
Python
170 lines
7.1 KiB
Python
from common.extensions import db
|
|
from .user import User, Tenant
|
|
from pgvector.sqlalchemy import Vector
|
|
from sqlalchemy.dialects.postgresql import JSONB
|
|
from sqlalchemy.dialects.postgresql import ARRAY
|
|
import sqlalchemy as sa
|
|
|
|
|
|
class Catalog(db.Model):
    """Catalog record with embedding, chat and metadata settings.

    Columns are grouped below into embedding (HTML/chunking) settings, chat
    temperatures, a tuning flag, free-form JSONB metadata and audit
    ("versioning") columns.
    """

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50), nullable=False)
    description = db.Column(db.Text, nullable=True)
    type = db.Column(db.String(50), nullable=False, default="DEFAULT_CATALOG")

    # Embedding variables
    # The list defaults are supplied through callables so that every INSERT
    # receives its own fresh list. A literal list would be one shared object
    # handed to every new row, so an in-place mutation on one instance could
    # leak into the default used by later rows.
    html_tags = db.Column(
        ARRAY(sa.String(10)),
        nullable=True,
        default=lambda: ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'],
    )
    html_end_tags = db.Column(
        ARRAY(sa.String(10)),
        nullable=True,
        default=lambda: ['p', 'li'],
    )
    html_included_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
    html_excluded_elements = db.Column(ARRAY(sa.String(50)), nullable=True)
    html_excluded_classes = db.Column(ARRAY(sa.String(200)), nullable=True)

    # NOTE(review): unit of the chunk sizes (characters vs. tokens) is not
    # visible in this file - confirm against the chunking code.
    min_chunk_size = db.Column(db.Integer, nullable=True, default=2000)
    max_chunk_size = db.Column(db.Integer, nullable=True, default=3000)

    # Chat variables ==> Move to Specialist?
    chat_RAG_temperature = db.Column(db.Float, nullable=True, default=0.3)
    chat_no_RAG_temperature = db.Column(db.Float, nullable=True, default=0.5)

    # Tuning enablers
    embed_tuning = db.Column(db.Boolean, nullable=True, default=False)

    # Meta Data (schemaless JSONB documents)
    user_metadata = db.Column(JSONB, nullable=True)
    system_metadata = db.Column(JSONB, nullable=True)
    configuration = db.Column(JSONB, nullable=True)

    # Versioning Information
    # Timestamps are filled in by the database (server_default); updated_at is
    # additionally refreshed on every UPDATE issued through SQLAlchemy.
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
    created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
|
|
|
|
|
|
class Retriever(db.Model):
    """Retriever definition, optionally linked to a catalog.

    Holds the identifying columns, a ``type`` discriminator string, a
    ``tuning`` flag, free-form JSONB metadata and audit columns.
    """

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(50), nullable=False)
    description = db.Column(db.Text, nullable=True)
    # Referenced by table/column name; the link to a catalog is optional.
    catalog_id = db.Column(
        db.Integer,
        db.ForeignKey('catalog.id'),
        nullable=True,
    )
    type = db.Column(
        db.String(50),
        nullable=False,
        default="DEFAULT_RAG",
    )
    # Stored as a real column so it is available for every retriever type.
    tuning = db.Column(
        db.Boolean,
        nullable=True,
        default=False,
    )

    # Meta Data (schemaless JSONB documents)
    user_metadata = db.Column(JSONB, nullable=True)
    system_metadata = db.Column(JSONB, nullable=True)
    configuration = db.Column(JSONB, nullable=True)

    # Versioning Information
    # Both timestamps default to the database clock; updated_at is also
    # refreshed whenever the row is updated through SQLAlchemy.
    created_at = db.Column(
        db.DateTime,
        nullable=False,
        server_default=db.func.now(),
    )
    created_by = db.Column(
        db.Integer,
        db.ForeignKey(User.id),
        nullable=True,
    )
    updated_at = db.Column(
        db.DateTime,
        nullable=False,
        server_default=db.func.now(),
        onupdate=db.func.now(),
    )
    updated_by = db.Column(
        db.Integer,
        db.ForeignKey(User.id),
    )
|
|
|
|
|
|
class Document(db.Model):
    """Named document that may belong to a catalog and owns its versions.

    ``versions`` lists the related ``DocumentVersion`` rows; the same
    relationship exposes the reverse side as ``DocumentVersion.document``.
    """

    id = db.Column(db.Integer, primary_key=True)
    # tenant_id = db.Column(db.Integer, db.ForeignKey(Tenant.id), nullable=False)
    catalog_id = db.Column(
        db.Integer,
        db.ForeignKey(Catalog.id),
        nullable=True,
    )
    name = db.Column(db.String(100), nullable=False)
    # Optional validity window; either bound may be left empty.
    valid_from = db.Column(db.DateTime, nullable=True)
    valid_to = db.Column(db.DateTime, nullable=True)

    # Versioning Information
    # Timestamps default to the database clock; updated_at is refreshed on
    # every UPDATE issued through SQLAlchemy.
    created_at = db.Column(
        db.DateTime,
        nullable=False,
        server_default=db.func.now(),
    )
    created_by = db.Column(
        db.Integer,
        db.ForeignKey(User.id),
        nullable=True,
    )
    updated_at = db.Column(
        db.DateTime,
        nullable=False,
        server_default=db.func.now(),
        onupdate=db.func.now(),
    )
    updated_by = db.Column(
        db.Integer,
        db.ForeignKey(User.id),
    )

    # Relations
    versions = db.relationship(
        'DocumentVersion',
        backref='document',
        lazy=True,
    )

    def __repr__(self):
        """Return a short debug representation with id and name."""
        return f"<Document {self.id}: {self.name}>"
|
|
|
|
|
|
class DocumentVersion(db.Model):
    """One stored version of a ``Document`` in a given language.

    Carries the storage location (URL / bucket / object name), file details,
    free-text and JSONB context/metadata, audit columns and processing status.
    The owning document is reachable as ``self.document`` through the backref
    declared on ``Document.versions``.
    """

    id = db.Column(db.Integer, primary_key=True)
    doc_id = db.Column(db.Integer, db.ForeignKey(Document.id), nullable=False)
    url = db.Column(db.String(200), nullable=True)
    bucket_name = db.Column(db.String(255), nullable=True)
    object_name = db.Column(db.String(200), nullable=True)
    file_type = db.Column(db.String(20), nullable=True)
    # NOTE(review): unit of file_size is not visible in this file - confirm.
    file_size = db.Column(db.Float, nullable=True)
    # Two-character language code (required).
    language = db.Column(db.String(2), nullable=False)
    user_context = db.Column(db.Text, nullable=True)
    system_context = db.Column(db.Text, nullable=True)
    user_metadata = db.Column(JSONB, nullable=True)
    system_metadata = db.Column(JSONB, nullable=True)
    catalog_properties = db.Column(JSONB, nullable=True)

    # Versioning Information
    # Timestamps default to the database clock; updated_at is refreshed on
    # every UPDATE issued through SQLAlchemy.
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
    created_by = db.Column(db.Integer, db.ForeignKey(User.id))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))

    # Processing Information
    processing = db.Column(db.Boolean, nullable=False, default=False)
    processing_started_at = db.Column(db.DateTime, nullable=True)
    processing_finished_at = db.Column(db.DateTime, nullable=True)
    processing_error = db.Column(db.String(255), nullable=True)

    # Relations
    embeddings = db.relationship('Embedding', backref='document_version', lazy=True)

    def __repr__(self):
        """Return ``<DocumentVersion {doc_id}.{language}.{id}>``.

        Uses this model's own ``doc_id`` and ``language`` columns. The model
        defines no ``document_language`` attribute, so reading it would raise
        ``AttributeError`` whenever the object is printed or logged.
        """
        return f"<DocumentVersion {self.doc_id}.{self.language}.{self.id}>"
|
|
|
|
|
|
class Embedding(db.Model):
    """Polymorphic base row for a text chunk of a document version.

    The ``type`` column is the polymorphic discriminator. Subclasses in this
    file add their own table holding the vector, keyed by this table's id.
    """

    __tablename__ = 'embeddings'

    id = db.Column(db.Integer, primary_key=True)
    # Discriminator value identifying the concrete embedding subclass.
    type = db.Column(db.String(30), nullable=False)
    doc_vers_id = db.Column(
        db.Integer,
        db.ForeignKey(DocumentVersion.id),
        nullable=False,
    )
    active = db.Column(db.Boolean, nullable=False, default=True)
    # The chunk text itself.
    chunk = db.Column(db.Text, nullable=False)

    # `type` below is the column defined above, not the builtin.
    __mapper_args__ = dict(
        polymorphic_identity='embedding',
        polymorphic_on=type,
    )
|
|
|
|
|
|
class EmbeddingMistral(Embedding):
    """Embedding subclass storing a 1024-dimensional vector."""

    __tablename__ = 'embedding_mistral'

    # Shares its primary key with the parent row in `embeddings`.
    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # The MISTRAL embedding dimension is 1024; switching to another
    # embedding model may require changing this dimension.
    embedding = db.Column(Vector(1024), nullable=False)

    __mapper_args__ = dict(polymorphic_identity='embedding_mistral')
|
|
|
|
|
|
class EmbeddingSmallOpenAI(Embedding):
    """Embedding subclass storing a 1536-dimensional vector."""

    __tablename__ = 'embedding_small_openai'

    # Shares its primary key with the parent row in `embeddings`.
    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # The OpenAI Small embedding dimension is 1536; switching to another
    # embedding model may require changing this dimension.
    embedding = db.Column(Vector(1536), nullable=False)

    __mapper_args__ = dict(polymorphic_identity='embedding_small_openai')
|
|
|
|
|
|
class EmbeddingLargeOpenAI(Embedding):
    """Embedding subclass storing a 3072-dimensional vector."""

    __tablename__ = 'embedding_large_openai'

    # Shares its primary key with the parent row in `embeddings`.
    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # The OpenAI Large embedding dimension is 3072; switching to another
    # embedding model may require changing this dimension.
    embedding = db.Column(Vector(3072), nullable=False)

    __mapper_args__ = dict(polymorphic_identity='embedding_large_openai')
|