- Fix bug where a URL could become too long because of tracking parameters: added a clean_url function, to be called before adding a URL.

This commit is contained in:
Josako
2025-03-17 17:39:32 +01:00
parent 56a00c2894
commit a6402524ce
3 changed files with 26 additions and 5 deletions

View File

@@ -8,7 +8,7 @@ from common.extensions import db, minio_client
from common.utils.celery_utils import current_celery
from flask import current_app
import requests
from urllib.parse import urlparse, unquote, urlunparse
from urllib.parse import urlparse, unquote, urlunparse, parse_qs
import os
from .config_field_types import normalize_json_field
@@ -182,6 +182,24 @@ def process_url(url, tenant_id):
return file_content, filename, extension
def clean_url(url):
    """Return *url* with known marketing/tracking query parameters removed.

    Only the query component is touched; scheme, host, path, params and
    fragment are passed through unchanged. Every query parameter that is
    not a tracking parameter is kept exactly as it was written in the
    input: repeated keys, blank values and percent-escapes all survive.

    Args:
        url: The URL string to clean.

    Returns:
        The URL string without any ``utm_*``, ``hsa_*``, ``gad_source`` or
        ``gbraid`` query parameters.
    """
    tracking_params = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
                       "hsa_acc", "hsa_cam", "hsa_grp", "hsa_ad", "hsa_src", "hsa_tgt", "hsa_kw",
                       "hsa_mt", "hsa_net", "hsa_ver", "gad_source", "gbraid"}

    parsed_url = urlparse(url)

    # Filter the raw "key=value" segments instead of round-tripping through
    # parse_qs: parse_qs percent-decodes values and drops blank ones, and
    # re-joining only the first value per key loses repeated parameters and
    # can emit unescaped '&', '=' or spaces into the rebuilt query.
    kept_segments = []
    for segment in parsed_url.query.split("&"):
        if not segment:
            # Empty piece from a leading, trailing or doubled '&'.
            continue
        raw_key = segment.partition("=")[0]
        # Decode only for the comparison so an escaped key still matches.
        if unquote(raw_key) in tracking_params:
            continue
        kept_segments.append(segment)

    clean_query = "&".join(kept_segments)
    return urlunparse(parsed_url._replace(query=clean_query))
def start_embedding_task(tenant_id, doc_vers_id):
task = current_celery.send_task('create_embeddings',
args=[tenant_id, doc_vers_id,],

View File

@@ -17,7 +17,7 @@ from common.utils.document_utils import (
create_document_stack, process_url, start_embedding_task,
EveAIInvalidLanguageException, EveAIDoubleURLException, EveAIUnsupportedFileType,
get_documents_list, edit_document, refresh_document, edit_document_version,
refresh_document_with_info, lookup_document, refresh_document_with_content
refresh_document_with_info, lookup_document, refresh_document_with_content, clean_url
)
from common.utils.eveai_exceptions import EveAIException
from eveai_api.api.auth import requires_service
@@ -271,11 +271,12 @@ class AddURL(Resource):
try:
args = document_ns.payload
file_content, filename, extension = process_url(args['url'], tenant_id)
cleaned_url = clean_url(args['url'])
file_content, filename, extension = process_url(cleaned_url, tenant_id)
api_input = {
'catalog_id': args['catalog_id'],
'url': args['url'],
'url': cleaned_url,
'name': args.get('name') or filename,
'language': args['language'],
'user_context': args.get('user_context'),

View File

@@ -16,7 +16,7 @@ from common.extensions import db, cache_manager
from common.models.interaction import Specialist, SpecialistRetriever
from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \
edit_document, \
edit_document_version, refresh_document
edit_document_version, refresh_document, clean_url
from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \
EveAIDoubleURLException
from config.type_defs.processor_types import PROCESSOR_TYPES
@@ -444,6 +444,8 @@ def add_url():
tenant_id = session['tenant']['id']
url = form.url.data
url = clean_url(url)
file_content, filename, extension = process_url(url, tenant_id)
catalog_properties = {}