def clean_url(url):
    """Return *url* with known marketing/tracking query parameters removed.

    The query string is filtered pair by pair on its raw text instead of being
    decoded and rebuilt. Everything that is kept therefore stays exactly as
    the caller supplied it: percent-encoding is preserved (``q=a%26b`` is not
    turned into ``q=a&b``), repeated parameters keep all of their values, and
    blank or value-less parameters (``flag=`` / ``flag``) are not dropped.

    Args:
        url: The URL string to clean.

    Returns:
        The URL without the tracking parameters. Scheme, host, path, params
        and fragment are passed through ``urlparse``/``urlunparse`` unchanged.
    """
    tracking_params = frozenset({
        "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
        "hsa_acc", "hsa_cam", "hsa_grp", "hsa_ad", "hsa_src", "hsa_tgt", "hsa_kw",
        "hsa_mt", "hsa_net", "hsa_ver", "gad_source", "gbraid",
    })

    parsed_url = urlparse(url)

    kept_pairs = []
    for pair in parsed_url.query.split("&"):
        if not pair:
            # Empty segment (empty query, or stray "&&"): nothing to keep.
            continue
        # Decode only the key, and only for the comparison, so that an
        # encoded spelling such as "utm%5Fsource" is still recognised while
        # the retained pairs keep their original encoding.
        key = unquote(pair.partition("=")[0])
        if key not in tracking_params:
            kept_pairs.append(pair)

    return urlunparse(parsed_url._replace(query="&".join(kept_pairs)))
refresh_document_with_content, clean_url ) from common.utils.eveai_exceptions import EveAIException from eveai_api.api.auth import requires_service @@ -271,11 +271,12 @@ class AddURL(Resource): try: args = document_ns.payload - file_content, filename, extension = process_url(args['url'], tenant_id) + cleaned_url = clean_url(args['url']) + file_content, filename, extension = process_url(cleaned_url, tenant_id) api_input = { 'catalog_id': args['catalog_id'], - 'url': args['url'], + 'url': cleaned_url, 'name': args.get('name') or filename, 'language': args['language'], 'user_context': args.get('user_context'), diff --git a/eveai_app/views/document_views.py b/eveai_app/views/document_views.py index 5ec38ef..bf15abb 100644 --- a/eveai_app/views/document_views.py +++ b/eveai_app/views/document_views.py @@ -16,7 +16,7 @@ from common.extensions import db, cache_manager from common.models.interaction import Specialist, SpecialistRetriever from common.utils.document_utils import create_document_stack, start_embedding_task, process_url, \ edit_document, \ - edit_document_version, refresh_document + edit_document_version, refresh_document, clean_url from common.utils.eveai_exceptions import EveAIInvalidLanguageException, EveAIUnsupportedFileType, \ EveAIDoubleURLException from config.type_defs.processor_types import PROCESSOR_TYPES @@ -444,6 +444,8 @@ def add_url(): tenant_id = session['tenant']['id'] url = form.url.data + url = clean_url(url) + file_content, filename, extension = process_url(url, tenant_id) catalog_properties = {}