- Fix bug where a URL could become too long due to tracking parameters ==> added a clean_url function, to be called before adding a URL.
This commit is contained in:
@@ -8,7 +8,7 @@ from common.extensions import db, minio_client
|
||||
from common.utils.celery_utils import current_celery
|
||||
from flask import current_app
|
||||
import requests
|
||||
from urllib.parse import urlparse, unquote, urlunparse
|
||||
from urllib.parse import urlparse, unquote, urlunparse, parse_qs
|
||||
import os
|
||||
|
||||
from .config_field_types import normalize_json_field
|
||||
@@ -182,6 +182,24 @@ def process_url(url, tenant_id):
|
||||
return file_content, filename, extension
|
||||
|
||||
|
||||
def clean_url(url):
    """Return *url* with known marketing/tracking query parameters removed.

    Only the query string is touched; scheme, host, path, params and
    fragment are passed through unchanged. Parameters that are kept are
    preserved exactly as they appeared in the input: original order,
    repeated keys, blank values and percent-encoding are all retained.

    Args:
        url: The URL string to clean.

    Returns:
        The URL string without any ``utm_*``, ``hsa_*``, ``gad_source`` or
        ``gbraid`` query parameters.
    """
    # Local import so the block does not depend on a change to the file-level
    # import list.
    from urllib.parse import unquote_plus

    tracking_params = {
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
        "hsa_acc", "hsa_cam", "hsa_grp", "hsa_ad", "hsa_src", "hsa_tgt", "hsa_kw",
        "hsa_mt", "hsa_net", "hsa_ver", "gad_source", "gbraid",
    }

    parsed_url = urlparse(url)

    # Filter the raw "key=value" pairs instead of decoding and re-building
    # them: decoding values and joining them back unescaped would corrupt
    # values containing "&", "=", "#" or spaces, and a dict-based round trip
    # would lose repeated keys and blank values.
    kept_pairs = [
        pair
        for pair in parsed_url.query.split("&")
        # Skip empty segments (e.g. from "a=1&&b=2"); decode only the key so
        # that a percent-encoded tracking key is still recognised.
        if pair and unquote_plus(pair.partition("=")[0]) not in tracking_params
    ]

    return urlunparse(parsed_url._replace(query="&".join(kept_pairs)))
|
||||
|
||||
|
||||
def start_embedding_task(tenant_id, doc_vers_id):
|
||||
task = current_celery.send_task('create_embeddings',
|
||||
args=[tenant_id, doc_vers_id,],
|
||||
|
||||
@@ -17,7 +17,7 @@ from common.utils.document_utils import (
|
||||
create_document_stack, process_url, start_embedding_task,
|
||||
EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
|
||||
get_documents_list, edit_document, refresh_document, edit_document_version,
|
||||
refresh_document_with_info, lookup_document, refresh_document_with_content
|
||||
refresh_document_with_info, lookup_document, refresh_document_with_content, clean_url
|
||||
)
|
||||
from common.utils.eveai_exceptions import EveAIException
|
||||
from eveai_api.api.auth import requires_service
|
||||
@@ -271,11 +271,12 @@ class AddURL(Resource):
|
||||
|
||||
try:
|
||||
args = document_ns.payload
|
||||
file_content, filename, extension = process_url(args['url'], tenant_id)
|
||||
cleaned_url = clean_url(args['url'])
|
||||
file_content, filename, extension = process_url(cleaned_url, tenant_id)
|
||||
|
||||
api_input = {
|
||||
'catalog_id': args['catalog_id'],
|
||||
'url': args['url'],
|
||||
'url': cleaned_url,
|
||||
'name': args.get('name') or filename,
|
||||
'language': args['language'],
|
||||
'user_context': args.get('user_context'),
|
||||
|
||||
@@ -16,7 +16,7 @@ from common.extensions import db, cache_manager
|
||||
from common.models.interaction import Specialist, SpecialistRetriever
|
||||
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
|
||||
edit_document, \
|
||||
edit_document_version, refresh_document
|
||||
edit_document_version, refresh_document, clean_url
|
||||
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
|
||||
EveAIDoubleURLException
|
||||
from config.type_defs.processor_types import PROCESSOR_TYPES
|
||||
@@ -444,6 +444,8 @@ def add_url():
|
||||
tenant_id = session['tenant']['id']
|
||||
url = form.url.data
|
||||
|
||||
url = clean_url(url)
|
||||
|
||||
file_content, filename, extension = process_url(url, tenant_id)
|
||||
|
||||
catalog_properties = {}
|
||||
|
||||
Reference in New Issue
Block a user