- Introduction of API functionality (to be continued). Deduplication of document and URL uploads between views and API. - Improvements to document processing - introduction of processor classes to streamline document inputs - Removed pure YouTube functionality, as YouTube retrieval of documents continuously changes, but added upload of srt, mp3, ogg and mp4 files.
113 lines
4.3 KiB
Python
113 lines
4.3 KiB
Python
from common.extensions import db
|
|
from .user import User, Tenant
|
|
from pgvector.sqlalchemy import Vector
|
|
|
|
|
|
class Document(db.Model):
    """Document record belonging to a tenant.

    Holds the document name, the owning tenant, optional ``valid_from`` /
    ``valid_to`` timestamps, audit columns, and the one-to-many link to the
    document's ``DocumentVersion`` rows.
    """

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(100), nullable=False)
    tenant_id = db.Column(
        db.Integer,
        db.ForeignKey(Tenant.id),
        nullable=False,
    )
    valid_from = db.Column(db.DateTime, nullable=True)
    valid_to = db.Column(db.DateTime, nullable=True)

    # Versioning Information
    created_at = db.Column(
        db.DateTime,
        nullable=False,
        server_default=db.func.now(),
    )
    created_by = db.Column(
        db.Integer,
        db.ForeignKey(User.id),
        nullable=True,
    )
    # ``onupdate`` refreshes the timestamp whenever the row is updated.
    updated_at = db.Column(
        db.DateTime,
        nullable=False,
        server_default=db.func.now(),
        onupdate=db.func.now(),
    )
    updated_by = db.Column(
        db.Integer,
        db.ForeignKey(User.id),
    )

    # Relations
    # The backref exposes the parent as ``DocumentVersion.document``.
    versions = db.relationship(
        'DocumentVersion',
        backref='document',
        lazy=True,
    )

    def __repr__(self):
        """Return a short debug label made of the id and the name."""
        return f"<Document {self.id}: {self.name}>"
|
|
|
|
|
|
class DocumentVersion(db.Model):
    """A single version of a ``Document``.

    Stores where the content came from (URL and/or stored file), its
    language, optional user/system context, audit columns, and the state of
    the processing run. The parent document is reachable through the
    ``document`` backref declared by ``Document.versions``.
    """

    id = db.Column(db.Integer, primary_key=True)
    doc_id = db.Column(db.Integer, db.ForeignKey(Document.id), nullable=False)
    url = db.Column(db.String(200), nullable=True)
    file_location = db.Column(db.String(255), nullable=True)
    file_name = db.Column(db.String(200), nullable=True)
    file_type = db.Column(db.String(20), nullable=True)
    language = db.Column(db.String(2), nullable=False)
    user_context = db.Column(db.Text, nullable=True)
    system_context = db.Column(db.Text, nullable=True)

    # Versioning Information
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now())
    created_by = db.Column(db.Integer, db.ForeignKey(User.id))
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.func.now(), onupdate=db.func.now())
    updated_by = db.Column(db.Integer, db.ForeignKey(User.id))

    # Processing Information
    processing = db.Column(db.Boolean, nullable=False, default=False)
    processing_started_at = db.Column(db.DateTime, nullable=True)
    processing_finished_at = db.Column(db.DateTime, nullable=True)
    processing_error = db.Column(db.String(255), nullable=True)

    # Relations
    embeddings = db.relationship('Embedding', backref='document_version', lazy=True)

    def __repr__(self):
        """Return a debug label of the form ``<DocumentVersion doc_id.language.id>``.

        Uses this model's own ``doc_id`` and ``language`` columns. The
        previous implementation dereferenced ``self.document_language``,
        which this model does not define, so calling ``repr()`` raised
        ``AttributeError``; it also emitted an unbalanced ``>``.
        """
        return f"<DocumentVersion {self.doc_id}.{self.language}.{self.id}>"

    def calc_file_location(self):
        """Return the relative storage directory ``tenant_id/document_id/language``.

        Reads the parent through the ``document`` backref, so the instance
        must already be attached to a ``Document``.
        """
        return f"{self.document.tenant_id}/{self.document.id}/{self.language}"

    def calc_file_name(self):
        """Return the stored file name ``<id>.<file_type>``."""
        return f"{self.id}.{self.file_type}"
|
|
|
|
|
|
class Embedding(db.Model):
    """Polymorphic base model for embedded text chunks.

    Each row stores one text chunk of a ``DocumentVersion``. The ``type``
    column is the polymorphic discriminator; subclasses keep their vector
    in their own table, joined on ``embeddings.id``.
    """

    __tablename__ = 'embeddings'

    id = db.Column(db.Integer, primary_key=True)
    # Discriminator column, referenced by ``polymorphic_on`` below.
    type = db.Column(db.String(30), nullable=False)
    doc_vers_id = db.Column(
        db.Integer,
        db.ForeignKey(DocumentVersion.id),
        nullable=False,
    )
    active = db.Column(db.Boolean, nullable=False, default=True)
    chunk = db.Column(db.Text, nullable=False)

    __mapper_args__ = {
        'polymorphic_identity': 'embedding',
        'polymorphic_on': type,
    }
|
|
|
|
|
|
class EmbeddingMistral(Embedding):
    """Embedding subtype holding a 1024-dimensional vector."""

    __tablename__ = 'embedding_mistral'

    # The primary key doubles as the foreign key to the base ``embeddings`` row.
    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # 1024 is the MISTRAL Embedding dimension.
    # If another embedding model is chosen, this dimension may need to be changed.
    embedding = db.Column(
        Vector(1024),
        nullable=False,
    )

    __mapper_args__ = {'polymorphic_identity': 'embedding_mistral'}
|
|
|
|
|
|
class EmbeddingSmallOpenAI(Embedding):
    """Embedding subtype holding a 1536-dimensional vector."""

    __tablename__ = 'embedding_small_openai'

    # The primary key doubles as the foreign key to the base ``embeddings`` row.
    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # 1536 is the OpenAI Small Embedding dimension.
    # If another embedding model is chosen, this dimension may need to be changed.
    embedding = db.Column(
        Vector(1536),
        nullable=False,
    )

    __mapper_args__ = {'polymorphic_identity': 'embedding_small_openai'}
|
|
|
|
|
|
class EmbeddingLargeOpenAI(Embedding):
    """Embedding subtype holding a 3072-dimensional vector."""

    __tablename__ = 'embedding_large_openai'

    # The primary key doubles as the foreign key to the base ``embeddings`` row.
    id = db.Column(
        db.Integer,
        db.ForeignKey('embeddings.id'),
        primary_key=True,
    )

    # 3072 is the OpenAI Large Embedding dimension.
    # If another embedding model is chosen, this dimension may need to be changed.
    embedding = db.Column(
        Vector(3072),
        nullable=False,
    )

    __mapper_args__ = {'polymorphic_identity': 'embedding_large_openai'}
|