- Improve annotation algorithm for YouTube (and others)
- Patch Pytube: improve OS deletion of files and writing of files
- Start working on Claude
- Improve template management
patched_packages/pytube/request.py (Normal file, 269 lines)
@@ -0,0 +1,269 @@
"""Implements a simple wrapper around urlopen."""
import http.client
import json
import logging
import re
import socket
from functools import lru_cache
from urllib import parse
from urllib.error import URLError
from urllib.request import Request, urlopen

from pytube.exceptions import RegexMatchError, MaxRetriesExceeded
from pytube.helpers import regex_search

logger = logging.getLogger(__name__)
default_range_size = 9437184  # 9MB


def _execute_request(
    url,
    method=None,
    headers=None,
    data=None,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT
):
    base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
    if headers:
        base_headers.update(headers)
    if data:
        # encode data for request
        if not isinstance(data, bytes):
            data = bytes(json.dumps(data), encoding="utf-8")
    if url.lower().startswith("http"):
        request = Request(url, headers=base_headers, method=method, data=data)
    else:
        raise ValueError("Invalid URL")
    return urlopen(request, timeout=timeout)  # nosec


def get(url, extra_headers=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Send an http GET request.

    :param str url:
        The URL to perform the GET request for.
    :param dict extra_headers:
        Extra headers to add to the request
    :rtype: str
    :returns:
        UTF-8 encoded string of response
    """
    if extra_headers is None:
        extra_headers = {}
    response = _execute_request(url, headers=extra_headers, timeout=timeout)
    return response.read().decode("utf-8")
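
# Illustrative usage sketch (not part of the original module): a GET request
# with one extra header, decoded to text by get(). The URL is a placeholder.
#
#     body = get("https://www.youtube.com/watch?v=example",
#                extra_headers={"accept": "text/html"})
#     print(body[:100])  # first 100 characters of the UTF-8 decoded response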


def post(url, extra_headers=None, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Send an http POST request.

    :param str url:
        The URL to perform the POST request for.
    :param dict extra_headers:
        Extra headers to add to the request
    :param dict data:
        The data to send on the POST request
    :rtype: str
    :returns:
        UTF-8 encoded string of response
    """
    # could technically be implemented in get,
    # but to avoid confusion implemented like this
    if extra_headers is None:
        extra_headers = {}
    if data is None:
        data = {}
    # required because the youtube servers are strict on content type
    # raises HTTPError [400]: Bad Request otherwise
    extra_headers.update({"Content-Type": "application/json"})
    response = _execute_request(
        url,
        headers=extra_headers,
        data=data,
        timeout=timeout
    )
    return response.read().decode("utf-8")
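
# Illustrative usage sketch (not part of the original module): posting a small
# dict, which _execute_request serializes to JSON bytes; the Content-Type
# header is forced to application/json above. The endpoint and payload shown
# here are hypothetical.
#
#     reply = post("https://www.youtube.com/youtubei/v1/example",
#                  data={"context": {}, "videoId": "example"})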


def seq_stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in sequence.
    :param str url: The URL to perform the GET request for.
    :rtype: Iterable[bytes]
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)

    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)

    segment_data = b''
    for chunk in stream(url, timeout=timeout, max_retries=max_retries):
        yield chunk
        segment_data += chunk

    # We can then parse the header to find the number of segments
    stream_info = segment_data.split(b'\r\n')
    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
    for line in stream_info:
        match = segment_count_pattern.search(line)
        if match:
            segment_count = int(match.group(1).decode('utf-8'))

    # We request these segments sequentially to build the file.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        yield from stream(url, timeout=timeout, max_retries=max_retries)
        seq_num += 1
    return  # pylint: disable=R1711
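
# Illustrative usage sketch (not part of the original module): writing a
# sequentially segmented stream to disk, segment by segment. The URL and
# filename are placeholders.
#
#     with open("video.webm", "wb") as fh:
#         for chunk in seq_stream("https://example.googlevideo.com/videoplayback?..."):
#             fh.write(chunk)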


def stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in chunks.
    :param str url: The URL to perform the GET request for.
    :rtype: Iterable[bytes]
    """
    file_size: int = default_range_size  # fake filesize to start
    downloaded = 0
    while downloaded < file_size:
        stop_pos = min(downloaded + default_range_size, file_size) - 1
        # range_header is currently unused; the byte range is appended to the
        # URL query string below instead of being sent as a Range header.
        range_header = f"bytes={downloaded}-{stop_pos}"
        tries = 0

        # Attempt to make the request multiple times as necessary.
        while True:
            # If the max retries is exceeded, raise an exception
            if tries >= 1 + max_retries:
                raise MaxRetriesExceeded()

            # Try to execute the request, ignoring socket timeouts
            try:
                response = _execute_request(
                    url + f"&range={downloaded}-{stop_pos}",
                    method="GET",
                    timeout=timeout
                )
            except URLError as e:
                # We only want to skip over timeout errors, and
                # raise any other URLError exceptions
                if isinstance(e.reason, socket.timeout):
                    pass
                else:
                    raise
            except http.client.IncompleteRead:
                # Allow retries on IncompleteRead errors for unreliable connections
                pass
            else:
                # On a successful request, break from loop
                break
            tries += 1

        if file_size == default_range_size:
            try:
                # Learn the real file size from an oversized range request
                resp = _execute_request(
                    url + f"&range={0}-{99999999999}",
                    method="GET",
                    timeout=timeout
                )
                content_range = resp.info()["Content-Length"]
                file_size = int(content_range)
            except (KeyError, IndexError, ValueError) as e:
                logger.error(e)
        while True:
            chunk = response.read()
            if not chunk:
                break
            downloaded += len(chunk)
            yield chunk
    return  # pylint: disable=R1711
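
# Illustrative usage sketch (not part of the original module): saving a
# progressive stream URL to disk in 9MB range-request chunks, allowing one
# retry per chunk. The URL and filename are placeholders.
#
#     with open("audio.mp4", "wb") as fh:
#         for chunk in stream("https://example.googlevideo.com/videoplayback?...",
#                             max_retries=1):
#             fh.write(chunk)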


@lru_cache()
def filesize(url):
    """Fetch size in bytes of file at given URL

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    """
    return int(head(url)["content-length"])


@lru_cache()
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    """
    total_filesize = 0
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)
    response = _execute_request(
        url, method="GET"
    )

    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize += len(response_value)

    # We can then parse the header to find the number of segments
    segment_count = 0
    stream_info = response_value.split(b'\r\n')
    segment_regex = b'Segment-Count: (\\d+)'
    for line in stream_info:
        # One of the lines should contain the segment count, but we don't know
        # which, so we need to iterate through the lines to find it
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
        except RegexMatchError:
            pass

    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # We make HEAD requests to the segments sequentially to find the total filesize.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        total_filesize += int(head(url)['content-length'])
        seq_num += 1
    return total_filesize
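
# Illustrative note (not part of the original module): filesize() reads one
# content-length header for a stream that answers a single HEAD request, while
# seq_filesize() handles segmented streams by adding the header segment's size
# to one HEAD request per segment. The URL below is a placeholder.
#
#     size_bytes = seq_filesize("https://example.googlevideo.com/videoplayback?...")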


def head(url):
    """Fetch headers returned by an http HEAD request.

    :param str url:
        The URL to perform the HEAD request for.
    :rtype: dict
    :returns:
        dictionary of lowercase headers
    """
    response_headers = _execute_request(url, method="HEAD").info()
    return {k.lower(): v for k, v in response_headers.items()}
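
# Illustrative usage sketch (not part of the original module): reading response
# metadata without downloading the body. The URL is a placeholder.
#
#     hdrs = head("https://example.googlevideo.com/videoplayback?...")
#     print(hdrs.get("content-type"), hdrs.get("content-length"))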