- Fix bug where a URL can become too long because of tracking parameters: added a clean_url function, to be called before adding a URL.

This commit is contained in:
Josako
2025-03-17 17:39:32 +01:00
parent 56a00c2894
commit a6402524ce
3 changed files with 26 additions and 5 deletions

View File

@@ -8,7 +8,7 @@ from common.extensions import db, minio_client
from common.utils.celery_utils import current_celery
from flask import current_app
import requests
from urllib.parse import urlparse, unquote, urlunparse
from urllib.parse import urlparse, unquote, urlunparse, parse_qs
import os
from .config_field_types import normalize_json_field
@@ -182,6 +182,24 @@ def process_url(url, tenant_id):
return file_content, filename, extension
def clean_url(url):
    """Return *url* with known marketing/tracking query parameters removed.

    Only the query component is touched; scheme, host, path and fragment are
    passed through ``urlparse``/``urlunparse`` unchanged.

    Parameters that are kept are copied verbatim from the original query
    string, which means:

    * their percent-encoding is preserved (``q=a%26b`` stays ``q=a%26b``),
    * repeated parameters keep every occurrence (``tag=a&tag=b``),
    * parameters with an empty value (``flag=``) are retained,
    * the original parameter order is retained.

    Args:
        url: The URL string to clean.

    Returns:
        The URL string without the tracking parameters listed in
        ``tracking_params``.
    """
    tracking_params = {
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
        "hsa_acc", "hsa_cam", "hsa_grp", "hsa_ad", "hsa_src", "hsa_tgt",
        "hsa_kw", "hsa_mt", "hsa_net", "hsa_ver", "gad_source", "gbraid",
    }

    parsed_url = urlparse(url)

    # Filter the raw "key=value" segments instead of decoding and re-encoding
    # them: decoding with parse_qs and re-joining the decoded values would
    # corrupt values containing '&', '=', '#' or spaces, collapse repeated
    # keys to their first value, and drop blank values.
    kept_segments = [
        segment
        for segment in parsed_url.query.split("&")
        # Skip empty segments (e.g. from "a=1&&b=2"); decode only the key so
        # that a percent-encoded tracking key is still recognised.
        if segment and unquote(segment.partition("=")[0]) not in tracking_params
    ]

    return urlunparse(parsed_url._replace(query="&".join(kept_segments)))
def start_embedding_task(tenant_id, doc_vers_id):
task = current_celery.send_task('create_embeddings',
args=[tenant_id, doc_vers_id,],