- Fix bug where a URL can be too long due to tracking parameters: added the clean_url function, to be called before adding a URL.
This commit is contained in:
@@ -8,7 +8,7 @@ from common.extensions import db, minio_client
|
||||
from common.utils.celery_utils import current_celery
|
||||
from flask import current_app
|
||||
import requests
|
||||
from urllib.parse import urlparse, unquote, urlunparse
|
||||
from urllib.parse import urlparse, unquote, urlunparse, parse_qs
|
||||
import os
|
||||
|
||||
from .config_field_types import normalize_json_field
|
||||
@@ -182,6 +182,24 @@ def process_url(url, tenant_id):
|
||||
return file_content, filename, extension
|
||||
|
||||
|
||||
def clean_url(url: str) -> str:
    """Return *url* with known marketing/tracking query parameters removed.

    Only the query component is touched; scheme, host, path, params and
    fragment are passed through ``urlparse``/``urlunparse`` unchanged.

    Every query parameter that is not a tracking parameter is kept exactly
    as it appears in the input: original order, original percent-encoding,
    repeated keys and blank values are all preserved.

    Args:
        url: The URL to clean.

    Returns:
        The URL without the tracking parameters. When no query parameters
        remain, the ``?`` separator is dropped as well.
    """
    tracking_params = {"utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
                       "hsa_acc", "hsa_cam", "hsa_grp", "hsa_ad", "hsa_src", "hsa_tgt", "hsa_kw",
                       "hsa_mt", "hsa_net", "hsa_ver", "gad_source", "gbraid"}

    parsed_url = urlparse(url)

    # Filter the raw "key=value" pairs instead of decoding and re-building the
    # query: decoding loses the percent-encoding (e.g. "%26" would turn into a
    # literal "&" and corrupt the URL), collapses repeated keys and drops
    # blank values. Only the key is decoded, and only for the comparison.
    kept_pairs = [
        pair
        for pair in parsed_url.query.split("&")
        if pair and unquote(pair.partition("=")[0]) not in tracking_params
    ]

    # Reconstruct the URL around the filtered query string.
    return urlunparse(parsed_url._replace(query="&".join(kept_pairs)))
|
||||
|
||||
|
||||
def start_embedding_task(tenant_id, doc_vers_id):
|
||||
task = current_celery.send_task('create_embeddings',
|
||||
args=[tenant_id, doc_vers_id,],
|
||||
|
||||
Reference in New Issue
Block a user