Compare commits

...

8 Commits

2 changed files with 168 additions and 16 deletions

View File

@ -10,12 +10,18 @@ mvw_endpoint = http://localhost:8000/api/query
title_dedup_winner = first
dl_progress_update_interval = 10
dl_threads = 2
dl_filename_pattern = &(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s
publish_date_srtftime_pattern = %%Y%%m%%d
dl_filename_replace_spaces_with =
dl_filename_all_lowercase = no
[maus]
min_duration = 1200
max_duration = 2700
query = @maus-query.json
title_not_regex = audiodeskription|gebärdensprache
dl_filename_pattern = &(publish_date)s.&(ext)s
publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
# query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
# state_file_name = maus
# tmp_base_dir = %(tmp_base_dir)s/maus

178
mvw-dl.py
View File

@ -3,6 +3,7 @@ import datetime as d
import json
import logging
import os
import pathlib
import re
import sys
import time
@ -28,10 +29,6 @@ from threading import Event
from typing import Iterable
from urllib.request import urlopen
# TODO set locale for datetime and others to globally stick to en_US
download_start_time = 0
download_last_update_time = 0
size_downloaded = 0
from rich.progress import (
BarColumn,
@ -43,6 +40,10 @@ from rich.progress import (
TransferSpeedColumn,
)
# TODO set locale for datetime and others to globally stick to en_US
download_start_time = 0
download_last_update_time = 0
size_downloaded = 0
progress = Progress(
TextColumn("[bold blue]{task.fields[filename]}", justify="right"),
BarColumn(bar_width=None),
@ -71,6 +72,7 @@ JSONType = t.Union[str, int, float, bool, None, t.Dict[str, t.Any], t.List[t.Any
# 1: Config file invalid, it has no sections
# 2: Config file invalid, sections must define at least CONST.CFG_MANDATORY
# 3: No search results to download
# 4: State file already exists, has more than 0 bytes size but doesn't contain usable JSON
class CONST(object):
@ -90,7 +92,11 @@ class CONST(object):
{"key": "mvw_endpoint", "value": "http://localhost:8000/api/query"},
{"key": "title_dedup_winner", "value": "first"},
{"key": "dl_progress_update_interval", "value": "10"},
{"key": "dl_threads", "value": "2"}
{"key": "dl_threads", "value": "2"},
{"key": "dl_filename_pattern", "value": "&(channel)s - &(publish_date)s - &(topic)s - &(title)s"},
{"key": "publish_date_srtftime_pattern", "value": "%%Y%%m%%d"},
{"key": "dl_filename_replace_spaces_with", "value": "_"},
{"key": "dl_filename_all_lowercase", "value": "yes"}
]
CFG_KNOWN_SECTION = [
{"key": "min_duration", "is_mandatory": False},
@ -345,17 +351,152 @@ def handle_sigint(signum, frame):
signal.signal(signal.SIGINT, handle_sigint)
def expanded_dest_dir(
        raw_dest_dir: str) -> str:
    """Return *raw_dest_dir* with ``~``/``~user`` and environment
    variables (``$VAR`` / ``${VAR}``) expanded."""
    with_user_expanded = os.path.expanduser(raw_dest_dir)
    return os.path.expandvars(with_user_expanded)
def filename_replace_pattern(
        section_name: str,
        config_obj: configparser.ConfigParser,
        show: type_def.mvw_json_response.Show,
        max_quality_url: str) -> str:
    """Expand the section's ``dl_filename_pattern`` into a concrete file name.

    Every ``&(attr)s`` placeholder is replaced with the corresponding
    attribute of *show*; the synthetic placeholders ``&(ext)s`` (file
    extension taken from *max_quality_url*) and ``&(publish_date)s``
    (show timestamp formatted with ``publish_date_srtftime_pattern``)
    are substituted afterwards.
    """
    filename = config_obj.get(section_name, "dl_filename_pattern")
    # Extension of the download URL, without the leading dot.
    ext = pathlib.Path(max_quality_url).suffix.lstrip(".")
    # NOTE(review): utcfromtimestamp() treats show.timestamp as epoch seconds
    # in UTC; it is deprecated in Python 3.12 — confirm before upgrading.
    publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime(
        config_obj.get(section_name, "publish_date_srtftime_pattern"))
    show_extended = {"ext": ext, "publish_date": publish_date}
    # All public, non-callable attributes of the show become candidate placeholders.
    show_attrs = [attr for attr in dir(show) if not attr.startswith('_') and not callable(getattr(show, attr))]
    for attr in show_attrs:
        log.debug(f"Replacing filename pattern '&({attr})s' ...")
        filename = re.sub(r"&\(" + re.escape(attr) + r"\)s", str(getattr(show, attr)), filename)
        # NOTE(review): logs the literal text '(unknown)' — likely meant
        # f"New filename: '{filename}'"; confirm and fix upstream.
        log.debug(f"New filename: '(unknown)'")
    for extended_attr in show_extended:
        log.debug(f"Replacing filename pattern '&({extended_attr})s' ...")
        filename = re.sub(r"&\(" + re.escape(extended_attr) + r"\)s", show_extended[extended_attr], filename)
        # NOTE(review): same literal '(unknown)' as above — confirm intent.
        log.debug(f"New filename: '(unknown)'")
    return filename
def get_safe_filename(
        dirty_filename: str) -> str:
    """Replace characters that are unsafe in file names with dashes.

    Strips path separators, shell/glob metacharacters, quotes and control
    characters (0x00-0x1F, 0x7F). Based on https://stackoverflow.com/a/71199182.
    """
    log.debug(f"Replacing unsafe characters in filename with dashes ...")
    # Fix: the previous version computed the substitution twice; the first
    # result was dead (immediately overwritten). One pass is sufficient.
    clean_filename = re.sub(r"""[/\\?%*:|"<>\x7F\x00-\x1F]""", "-", dirty_filename)
    log.debug(f"New filename: '{clean_filename}'")
    return clean_filename
def filename_replace_spaces_with_underscores(
        section_name: str,
        config_obj: configparser.ConfigParser(),
        filename: str) -> str:
    """Substitute every whitespace character in *filename* with the string
    configured under ``dl_filename_replace_spaces_with`` for this section."""
    space_replace_string = config_obj.get(section_name, "dl_filename_replace_spaces_with")
    log.debug(f"Replacing space characters with '{space_replace_string}' ...")
    underscored_filename = re.sub(r"\s", space_replace_string, filename)
    log.debug(f"New filename: '{underscored_filename}'")
    return underscored_filename
def get_filename(
        section_name: str,
        config_obj: configparser.ConfigParser,
        show: type_def.mvw_json_response.Show,
        max_quality_url: str) -> str:
    """Derive the final, filesystem-safe download file name for *show*.

    Pipeline: expand the configured filename pattern, replace unsafe
    characters, optionally replace whitespace
    (``dl_filename_replace_spaces_with``) and optionally lowercase the
    whole name (``dl_filename_all_lowercase``).
    """
    filename_replaced_patterns = filename_replace_pattern(section_name, config_obj, show, max_quality_url)
    filename_safe = get_safe_filename(filename_replaced_patterns)
    # Bug fix: the settings below were read from the module-level `config`
    # object instead of the `config_obj` parameter, silently ignoring any
    # parser instance passed in by the caller.
    if config_obj.get(section_name, "dl_filename_replace_spaces_with"):
        filename_safe = filename_replace_spaces_with_underscores(section_name, config_obj, filename_safe)
    if config_obj.getboolean(section_name, "dl_filename_all_lowercase"):
        log.debug(f"Lowercasing all filename characters ...")
        filename_safe = filename_safe.lower()
        log.debug(f"New filename: '{filename_safe}'")
    log.debug(filename_safe)
    return filename_safe
def get_state_file_abs_path(
        section_name: str,
        config_obj: configparser.ConfigParser()) -> str:
    """Build the absolute path of this section's state file from the
    configured directory, prefix and suffix: ``<dir>/<prefix><section><suffix>``."""
    prefix = config_obj.get(section_name, "state_file_name_prefix")
    suffix = config_obj.get(section_name, "state_file_name_suffix")
    state_dir = config_obj.get(section_name, "state_files_dir")
    return os.path.join(state_dir, f"{prefix}{section_name}{suffix}")
def state_file_none_or_valid_json(
        state_file_abs_path: str) -> bool:
    """Return True when the state file is usable: missing, empty, or
    containing parseable JSON. Exits with status 4 when a non-empty
    state file holds invalid JSON, since nothing could be logged to it."""
    # Guard clauses: a missing or zero-byte file is fine — it will be created.
    if not os.path.exists(state_file_abs_path):
        return True
    if os.path.getsize(state_file_abs_path) == 0:
        return True
    with open(state_file_abs_path, "r") as state_file:
        try:
            json.loads(state_file.read())
        except json.JSONDecodeError:
            log.warning(f"State file '{state_file_abs_path}' does not contain valid JSON. We're not going to "
                        f"be able to log anything into it. Exiting 4 ...")
            sys.exit(4)
    return True
def truncate_log(
        json_data: list,
        max_log_entries: int) -> list:
    """Trim *json_data* in place to at most *max_log_entries* entries,
    discarding the oldest (front) entries first, and return it.

    Bug fix: the previous loop deleted ``json_data[i]`` with an advancing
    index, which skips every other element after the first deletion and
    removed the wrong (including newest) entries. It also always deleted
    at least one entry even when the list was already within the limit.
    """
    while len(json_data) > max_log_entries:
        del json_data[0]
    return json_data
def log_successful_download(
        section_name: str,
        config_obj: configparser.ConfigParser,
        show: type_def.mvw_json_response.Show,
        state_file_abs_path: str) -> None:
    """Append a download-completion entry for *show* to the section's JSON
    state file.

    Each entry is keyed by the current epoch timestamp and records the
    show's topic and title plus the completion time (epoch and a
    human-readable UTC string). The log is trimmed to the configured
    ``state_file_retention`` number of entries.

    NOTE(review): this span in the diff interleaved the old one-argument
    stub with the new four-argument signature; reconstructed to the
    new-side version.
    """
    timestamp_now = int(time.time())
    # Exits with status 4 when an existing non-empty state file holds invalid JSON.
    state_file_none_or_valid_json(state_file_abs_path)
    os.makedirs(os.path.dirname(state_file_abs_path), exist_ok=True)
    state_body = show.dict(include={"topic", "title"})
    state_body["dl_complete_timestamp_epoch"] = timestamp_now
    state_body["dl_complete_timestamp_human"] = \
        d.datetime.utcfromtimestamp(timestamp_now).strftime("%Y-%m-%d %H%M%S UTC")
    state_entry = {timestamp_now: state_body}
    log.debug(f"Writing log entry to '{state_file_abs_path}' ...")
    # Robustness fix: the original opened the file with "r+", which raises
    # FileNotFoundError on the very first download (makedirs only creates
    # the directory). Read the existing log only when the file exists.
    json_state = []
    if os.path.exists(state_file_abs_path):
        with open(state_file_abs_path, "r") as state_file:
            try:
                json_state = json.load(state_file)
            except json.JSONDecodeError:
                # Unparseable content is discarded; the file is rewritten below.
                json_state = []
    with open(state_file_abs_path, "w") as state_file:
        json_state.append(state_entry)
        max_log_entries = config_obj.getint(section_name, "state_file_retention")
        if len(json_state) > max_log_entries:
            json_state = truncate_log(json_state, max_log_entries)
        json.dump(json_state, state_file, indent=4, sort_keys=True)
def copy_url(
@ -363,7 +504,8 @@ def copy_url(
config_obj: configparser.ConfigParser(),
show: type_def.mvw_json_response.Show,
video_metadata: dict,
total_content_length: int) -> None:
total_content_length: int,
state_file_abs_path: str) -> None:
"""Copy data from a url to a local file."""
global download_start_time
@ -372,15 +514,15 @@ def copy_url(
update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
max_quality_url = video_metadata["url"]
filename = max_quality_url.split("/")[-1]
dest_dir = config_obj.get(section_name, "dl_dir")
filename = get_filename(section_name, config_obj, show, max_quality_url)
dest_dir = expanded_dest_dir(config_obj.get(section_name, "dl_dir"))
dest_path = os.path.join(dest_dir, filename)
dest_path = os.path.expanduser(dest_path)
dest_path = os.path.expandvars(dest_path)
show_name = f"{show.topic} - {show.title}"
publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime('%Y%m%d')
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
# TODO quit
log_successful_download(section_name, config_obj, show, state_file_abs_path)
quit()
with open(dest_path, "wb") as dest_file:
log.info(f"""Downloading "{show_name}" ...""")
log.info(f"Download location resolved to {dest_path}")
@ -402,7 +544,7 @@ def copy_url(
log.info(f"""Download of "{show_name}" interrupted""")
return
log.info(f"""Download of "{show_name}" done""")
log_successful_download(show)
# log_successful_download(show)
def get_max_quality_url(
@ -434,6 +576,8 @@ def download_media(
global download_last_update_time
dl_threads = config_obj.getint(section_name, "dl_threads")
state_file_abs_path = get_state_file_abs_path(section_name, config_obj)
state_file_none_or_valid_json(state_file_abs_path)
video_metadata = {}
for result in json_obj.result.results.copy():
@ -446,6 +590,7 @@ def download_media(
video_metadata["total_content_length"] = total_content_length
log.info(f"""Download location is {config_obj.get(section_name, "dl_dir")}""")
log.info(f"Limiting parallel downloads to {dl_threads} ...")
# TODO prior to download check state file
with ThreadPoolExecutor(max_workers=dl_threads) as pool:
download_last_update_time = time.time()
download_start_time = download_last_update_time
@ -458,7 +603,8 @@ def download_media(
config_obj,
result,
video_metadata[result.id],
video_metadata["total_content_length"])
video_metadata["total_content_length"],
state_file_abs_path)
if __name__ == '__main__':