- Improve annotation algorithm for Youtube (and others)
- Patch Pytube - improve OS deletion of files and writing of files - Start working on Claude - Improve template management
This commit is contained in:
164
patched_packages/pytube/captions.py
Normal file
164
patched_packages/pytube/captions.py
Normal file
@@ -0,0 +1,164 @@
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import xml.etree.ElementTree as ElementTree
|
||||
from html import unescape
|
||||
from typing import Dict, Optional
|
||||
|
||||
from pytube import request
|
||||
from pytube.helpers import safe_filename, target_directory
|
||||
|
||||
|
||||
class Caption:
    """Container for a single caption track.

    Attributes set by ``__init__``:

    * ``url``  -- value of ``baseUrl`` in the track data (``None`` if absent).
    * ``name`` -- human readable track name.
    * ``code`` -- ``vssId`` of the track with surrounding ``.`` removed.
    """

    def __init__(self, caption_track: Dict):
        """Construct a :class:`Caption <Caption>`.

        :param dict caption_track:
            Caption track data extracted from ``watch_html``. Must contain
            the keys ``name`` and ``vssId``; ``baseUrl`` is optional.
        :raises KeyError:
            If ``name`` or ``vssId`` is missing, or if ``name`` holds neither
            ``simpleText`` nor ``runs``.
        """
        self.url = caption_track.get("baseUrl")

        # Certain videos have runs instead of simpleText
        # this handles that edge case
        name_dict = caption_track['name']
        if 'simpleText' in name_dict:
            self.name = name_dict['simpleText']
        else:
            # Default first so ``name`` always exists, even when no run
            # carries a ``text`` key (otherwise __repr__ would fail later).
            self.name = ""
            for el in name_dict['runs']:
                if 'text' in el:
                    # The last run that has text wins.
                    self.name = el['text']

        # Use "vssId" instead of "languageCode", fix issue #779
        # Remove preceding '.' for backwards compatibility, e.g.:
        # English -> vssId: .en, languageCode: en
        # English (auto-generated) -> vssId: a.en, languageCode: en
        self.code = caption_track["vssId"].strip('.')

    @property
    def xml_captions(self) -> str:
        """Download the xml caption tracks from ``self.url``."""
        return request.get(self.url)

    @property
    def json_captions(self) -> dict:
        """Download and parse the json caption tracks.

        The request URL is ``self.url`` with ``fmt=srv3`` replaced by
        ``fmt=json3``.

        :rtype: dict
        :raises AssertionError:
            If the ``wireMagic`` field of the response is not ``'pb3'``.
        """
        json_captions_url = self.url.replace('fmt=srv3', 'fmt=json3')
        text = request.get(json_captions_url)
        parsed = json.loads(text)
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if parsed['wireMagic'] != 'pb3':
            raise AssertionError('Unexpected captions format')
        return parsed

    def generate_srt_captions(self) -> str:
        """Generate "SubRip Subtitle" captions.

        Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
        recompiles them into the "SubRip Subtitle" format.
        """
        return self.xml_caption_to_srt(self.xml_captions)

    @staticmethod
    def float_to_srt_time_format(d: float) -> str:
        """Convert decimal durations into proper srt format.

        :param float d:
            Time offset in seconds.
        :rtype: str
        :returns:
            SubRip Subtitle (str) formatted time duration.

        float_to_srt_time_format(3.89) -> '00:00:03,890'
        """
        # Work in whole milliseconds so that rounding carries into the
        # seconds field (3.9996 -> 00:00:04,000) instead of producing a
        # malformed millisecond part. Negative offsets are clamped to zero
        # because they have no SRT representation.
        total_ms = max(0, int(round(d * 1000)))
        total_seconds, ms = divmod(total_ms, 1000)
        total_minutes, seconds = divmod(total_seconds, 60)
        # Hours are not wrapped at 24, unlike time.gmtime()-based formatting.
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"

    def _clean_caption_text(self, text: str) -> str:
        """Flatten newlines, collapse double spaces and unescape entities."""
        return unescape(text.replace("\n", " ").replace("  ", " "))

    def _legacy_cues(self, root):
        """Yield (start, end, text) from ``<text start= dur=>`` children."""
        for child in root:
            caption = self._clean_caption_text(child.text or "")
            # A missing "dur" means a zero-length cue.
            duration = float(child.attrib.get("dur", 0.0))
            start = float(child.attrib["start"])
            yield start, start + duration, caption

    def _srv3_cues(self, body):
        """Yield (start, end, text) from srv3 ``<p t= d=>`` paragraphs."""
        for paragraph in body.findall("p"):
            # Text may be split across nested elements; gather all of it.
            raw_text = "".join(paragraph.itertext()).strip()
            if not raw_text:
                # Whitespace-only paragraphs carry no caption text.
                continue
            # srv3 expresses "t" (start) and "d" (duration) in milliseconds.
            start = float(paragraph.attrib["t"]) / 1000.0
            duration = float(paragraph.attrib.get("d", 0.0)) / 1000.0
            yield start, start + duration, self._clean_caption_text(raw_text)

    def xml_caption_to_srt(self, xml_captions: str) -> str:
        """Convert xml caption tracks to "SubRip Subtitle (srt)".

        Two document layouts are understood: the legacy layout whose root
        directly contains ``<text start="..." dur="...">`` elements (times in
        seconds), and the srv3 layout whose root contains a ``<body>`` of
        ``<p t="..." d="...">`` elements (times in milliseconds).

        :param str xml_captions:
            XML formatted caption tracks.
        :rtype: str
        :returns:
            The cues in SRT format, numbered from 1, without trailing
            whitespace. An empty string if there are no cues.
        :raises KeyError:
            If a cue element lacks its start attribute.
        """
        root = ElementTree.fromstring(xml_captions)
        body = root.find("body")
        if body is not None:
            cues = self._srv3_cues(body)
        else:
            cues = self._legacy_cues(root)

        segments = [
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=self.float_to_srt_time_format(start),
                end=self.float_to_srt_time_format(end),
                text=caption,
            )
            for sequence_number, (start, end, caption) in enumerate(cues, 1)
        ]
        return "\n".join(segments).strip()

    def download(
        self,
        title: str,
        srt: bool = True,
        output_path: Optional[str] = None,
        filename_prefix: Optional[str] = None,
    ) -> str:
        """Write the caption track to disk.

        The file is named ``<prefix><title> (<code>).<srt|xml>`` where the
        prefix and title are passed through ``safe_filename``.

        :param title:
            Output filename (stem only) for writing the caption file. A
            trailing ``.srt`` or ``.xml`` extension is removed.
        :type title: str
        :param srt:
            Set to True to download srt, false to download xml. Defaults to
            True.
        :type srt: bool
        :param output_path:
            (optional) Output path for writing the caption file; resolved
            through ``target_directory``.
        :type output_path: str or None
        :param filename_prefix:
            (optional) A string that will be prepended to the filename.
            For example a number in a playlist or the name of a series.
            If one is not specified, nothing will be prepended
            This is separate from filename so you can use the default
            filename but still add a prefix.
        :type filename_prefix: str or None

        :rtype: str
        :returns:
            Path of the file that was written.
        """
        if title.endswith((".srt", ".xml")):
            # Drop only the final extension; earlier dots stay in the stem.
            filename = title.rpartition(".")[0]
        else:
            filename = title

        if filename_prefix:
            filename = f"{safe_filename(filename_prefix)}{filename}"

        filename = safe_filename(filename)
        filename += f" ({self.code})"
        filename += ".srt" if srt else ".xml"

        file_path = os.path.join(target_directory(output_path), filename)

        # Fetch (and convert) before opening the file so that a failed
        # download does not leave an empty file behind.
        content = self.generate_srt_captions() if srt else self.xml_captions

        with open(file_path, "w", encoding="utf-8") as file_handle:
            file_handle.write(content)

        return file_path

    def __repr__(self):
        """Printable object representation."""
        return f'<Caption lang="{self.name}" code="{self.code}">'
|
||||
Reference in New Issue
Block a user