Files
eveAI/common/models/document.py
2025-07-14 18:58:54 +02:00

204 lines
8.4 KiB
Python

from common.extensions import db
from .user import User, Tenant
from pgvector.sqlalchemy import Vector
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.dialects.postgresql import ARRAY
import sqlalchemy as sa
class Catalog(db.Model):
id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(50), nullable=False, unique=True)
description = db.Column(db.Text, nullable=True)
type = db.Column(db.String(50), nullable=False, default="STANDARD_CATALOG")
type_version = db.Column(db.String(20), nullable=True, default="1.0.0")
min_chunk_size = db.Column(db.Integer, nullable=True, default=1500)
max_chunk_size = db.Column(db.Integer, nullable=True, default=2500)
# Meta Data
user_metadata = db.Column(JSONB, nullable=True)
system_metadata = db.Column(JSONB, nullable=True)
configuration = db.Column(JSONB, nullable=True)
# Versioning Information
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
def to_dict(self):
return {
'id': self.id,
'name': self.name,
'description': self.description,
'type': self.type,
'type_version': self.type_version,
'min_chunk_size': self.min_chunk_size,
'max_chunk_size': self.max_chunk_size,
'user_metadata': self.user_metadata,
'system_metadata': self.system_metadata,
'configuration': self.configuration,
}
class Processor(db.Model):
id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(50), nullable=False)
description = db.Column(db.Text, nullable=True)
catalog_id = db.Column(db.Integer, db.ForeignKey('catalog.id'), nullable=True)
type = db.Column(db.String(50), nullable=False)
sub_file_type = db.Column(db.String(50), nullable=True)
active = db.Column(db.Boolean, nullable=True, default=True)
# Tuning enablers
tuning = db.Column(db.Boolean, nullable=True, default=False)
# Meta Data
user_metadata = db.Column(JSONB, nullable=True)
system_metadata = db.Column(JSONB, nullable=True)
configuration = db.Column(JSONB, nullable=True)
# Versioning Information
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
class Retriever(db.Model):
id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String(50), nullable=False)
description = db.Column(db.Text, nullable=True)
catalog_id = db.Column(db.Integer, db.ForeignKey('catalog.id'), nullable=True)
type = db.Column(db.String(50), nullable=False, default="STANDARD_RAG")
type_version = db.Column(db.String(20), nullable=True, default="STANDARD_RAG")
tuning = db.Column(db.Boolean, nullable=True, default=False)
# Meta Data
user_metadata = db.Column(JSONB, nullable=True)
system_metadata = db.Column(JSONB, nullable=True)
configuration = db.Column(JSONB, nullable=True)
arguments = db.Column(JSONB, nullable=True)
# Versioning Information
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
class Document(db.Model):
id = db.Column(db.Integer, primary_key=True)
# tenant_id = db.Column(db.Integer, db.ForeignKey(Tenant.id), nullable=False)
catalog_id = db.Column(db.Integer, db.ForeignKey(Catalog.id), nullable=True)
name = db.Column(db.String(100), nullable=False)
valid_from = db.Column(db.DateTime, nullable=True)
valid_to = db.Column(db.DateTime, nullable=True)
# Versioning Information
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
created_by = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
# Relations
versions = db.relationship('DocumentVersion', backref='document', lazy=True)
@property
def latest_version(self):
"""Returns the latest document version (the one with highest id)"""
from sqlalchemy import desc
return DocumentVersion.query.filter_by(doc_id=self.id).order_by(desc(DocumentVersion.id)).first()
def __repr__(self):
return f"<Document {self.id}: {self.name}>"
class DocumentVersion(db.Model):
id = db.Column(db.Integer, primary_key=True)
doc_id = db.Column(db.Integer, db.ForeignKey(Document.id), nullable=False)
url = db.Column(db.String(200), nullable=True)
bucket_name = db.Column(db.String(255), nullable=True)
object_name = db.Column(db.String(200), nullable=True)
file_type = db.Column(db.String(20), nullable=True)
sub_file_type = db.Column(db.String(50), nullable=True)
file_size = db.Column(db.Float, nullable=True)
language = db.Column(db.String(2), nullable=False)
user_context = db.Column(db.Text, nullable=True)
system_context = db.Column(db.Text, nullable=True)
user_metadata = db.Column(JSONB, nullable=True)
system_metadata = db.Column(JSONB, nullable=True)
catalog_properties = db.Column(JSONB, nullable=True)
# Versioning Information
created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
created_by = db.Column(db.Integer, db.ForeignKey(User.id))
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
updated_by = db.Column(db.Integer, db.ForeignKey(User.id))
# Processing Information
processing = db.Column(db.Boolean, nullable=False, default=False)
processing_started_at = db.Column(db.DateTime, nullable=True)
processing_finished_at = db.Column(db.DateTime, nullable=True)
processing_error = db.Column(db.String(255), nullable=True)
# Relations
embeddings = db.relationship('Embedding', backref='document_version', lazy=True)
def __repr__(self):
return f"<DocumentVersion {self.document_language.document_id}.{self.document_language.language}>.{self.id}>"
class Embedding(db.Model):
__tablename__ = 'embeddings'
id = db.Column(db.Integer, primary_key=True)
type = db.Column(db.String(30), nullable=False)
doc_vers_id = db.Column(db.Integer, db.ForeignKey(DocumentVersion.id), nullable=False)
active = db.Column(db.Boolean, nullable=False, default=True)
chunk = db.Column(db.Text, nullable=False)
__mapper_args__ = {
'polymorphic_identity': 'embedding',
'polymorphic_on': type
}
class EmbeddingMistral(Embedding):
__tablename__ = 'embedding_mistral'
id = db.Column(db.Integer, db.ForeignKey('embeddings.id'), primary_key=True)
# 1024 is the MISTRAL Embedding dimension.
# If another embedding model is chosen, this dimension may need to be changed.
embedding = db.Column(Vector(1024), nullable=False)
__mapper_args__ = {
'polymorphic_identity': 'embedding_mistral',
}
class EmbeddingSmallOpenAI(Embedding):
__tablename__ = 'embedding_small_openai'
id = db.Column(db.Integer, db.ForeignKey('embeddings.id'), primary_key=True)
# 1536 is the OpenAI Small Embedding dimension.
# If another embedding model is chosen, this dimension may need to be changed.
embedding = db.Column(Vector(1536), nullable=False)
__mapper_args__ = {
'polymorphic_identity': 'embedding_small_openai',
}
class EmbeddingLargeOpenAI(Embedding):
__tablename__ = 'embedding_large_openai'
id = db.Column(db.Integer, db.ForeignKey('embeddings.id'), primary_key=True)
# 3072 is the OpenAI Large Embedding dimension.
# If another embedding model is chosen, this dimension may need to be changed.
embedding = db.Column(Vector(3072), nullable=False)
__mapper_args__ = {
'polymorphic_identity': 'embedding_large_openai',
}