- Improve annotation algorithm for Youtube (and others)

- Patch Pytube - improve OS deletion of files and writing of files - Start working on Claude - Improve template management
2024-07-16 14:21:49 +02:00
parent db44fd3b66
commit 908a2eaf7e
39 changed files with 6427 additions and 324 deletions
--- a/patched_packages/pytube/parser.py
+++ b/patched_packages/pytube/parser.py
@@ -0,0 +1,185 @@
+import ast
+import json
+import re
+from pytube.exceptions import HTMLParseError
+
+
+def parse_for_all_objects(html, preceding_regex):
+    """Parses input html to find all matches for the input starting point.
+
+    :param str html:
+        HTML to be parsed for an object.
+    :param str preceding_regex:
+        Regex to find the string preceding the object.
+    :rtype list:
+    :returns:
+        A list of dicts created from parsing the objects.
+    """
+    result = []
+    regex = re.compile(preceding_regex)
+    match_iter = regex.finditer(html)
+    for match in match_iter:
+        if match:
+            start_index = match.end()
+            try:
+                obj = parse_for_object_from_startpoint(html, start_index)
+            except HTMLParseError:
+                # Some of the instances might fail because set is technically
+                # a method of the ytcfg object. We'll skip these since they
+                # don't seem relevant at the moment.
+                continue
+            else:
+                result.append(obj)
+
+    if len(result) == 0:
+        raise HTMLParseError(f'No matches for regex {preceding_regex}')
+
+    return result
+
+
+def parse_for_object(html, preceding_regex):
+    """Parses input html to find the end of a JavaScript object.
+
+    :param str html:
+        HTML to be parsed for an object.
+    :param str preceding_regex:
+        Regex to find the string preceding the object.
+    :rtype dict:
+    :returns:
+        A dict created from parsing the object.
+    """
+    regex = re.compile(preceding_regex)
+    result = regex.search(html)
+    if not result:
+        raise HTMLParseError(f'No matches for regex {preceding_regex}')
+
+    start_index = result.end()
+    return parse_for_object_from_startpoint(html, start_index)
+
+
+def find_object_from_startpoint(html, start_point):
+    """Parses input html to find the end of a JavaScript object.
+
+    :param str html:
+        HTML to be parsed for an object.
+    :param int start_point:
+        Index of where the object starts.
+    :rtype dict:
+    :returns:
+        A dict created from parsing the object.
+    """
+    html = html[start_point:]
+    if html[0] not in ['{','[']:
+        raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}')
+
+    # First letter MUST be a open brace, so we put that in the stack,
+    # and skip the first character.
+    last_char = '{'
+    curr_char = None
+    stack = [html[0]]
+    i = 1
+
+    context_closers = {
+        '{': '}',
+        '[': ']',
+        '"': '"',
+        '/': '/' # javascript regex
+    }
+
+    while i < len(html):
+        if len(stack) == 0:
+            break
+        if curr_char not in [' ', '\n']:
+            last_char = curr_char
+        curr_char = html[i]
+        curr_context = stack[-1]
+
+        # If we've reached a context closer, we can remove an element off the stack
+        if curr_char == context_closers[curr_context]:
+            stack.pop()
+            i += 1
+            continue
+
+        # Strings and regex expressions require special context handling because they can contain
+        #  context openers *and* closers
+        if curr_context in ['"', '/']:
+            # If there's a backslash in a string or regex expression, we skip a character
+            if curr_char == '\\':
+                i += 2
+                continue
+        else:
+            # Non-string contexts are when we need to look for context openers.
+            if curr_char in context_closers.keys():
+                # Slash starts a regular expression depending on context
+                if not (curr_char == '/' and last_char not in ['(', ',', '=', ':', '[', '!', '&', '|', '?', '{', '}', ';']): 
+                    stack.append(curr_char)
+
+        i += 1
+
+    full_obj = html[:i]
+    return full_obj  # noqa: R504
+
+
+def parse_for_object_from_startpoint(html, start_point):
+    """JSONifies an object parsed from HTML.
+
+    :param str html:
+        HTML to be parsed for an object.
+    :param int start_point:
+        Index of where the object starts.
+    :rtype dict:
+    :returns:
+        A dict created from parsing the object.
+    """
+    full_obj = find_object_from_startpoint(html, start_point)
+    try:
+        return json.loads(full_obj)
+    except json.decoder.JSONDecodeError:
+        try:
+            return ast.literal_eval(full_obj)
+        except (ValueError, SyntaxError):
+            raise HTMLParseError('Could not parse object.')
+
+
+def throttling_array_split(js_array):
+    """Parses the throttling array into a python list of strings.
+
+    Expects input to begin with `[` and close with `]`.
+
+    :param str js_array:
+        The javascript array, as a string.
+    :rtype: list:
+    :returns:
+        A list of strings representing splits on `,` in the throttling array.
+    """
+    results = []
+    curr_substring = js_array[1:]
+
+    comma_regex = re.compile(r",")
+    func_regex = re.compile(r"function\([^)]*\)")
+
+    while len(curr_substring) > 0:
+        if curr_substring.startswith('function'):
+            # Handle functions separately. These can contain commas
+            match = func_regex.search(curr_substring)
+            match_start, match_end = match.span()
+
+            function_text = find_object_from_startpoint(curr_substring, match.span()[1])
+            full_function_def = curr_substring[:match_end + len(function_text)]
+            results.append(full_function_def)
+            curr_substring = curr_substring[len(full_function_def) + 1:]
+        else:
+            match = comma_regex.search(curr_substring)
+
+            # Try-catch to capture end of array
+            try:
+                match_start, match_end = match.span()
+            except AttributeError:
+                match_start = len(curr_substring) - 1
+                match_end = match_start + 1
+
+            curr_el = curr_substring[:match_start]
+            results.append(curr_el)
+            curr_substring = curr_substring[match_end:]
+
+    return results