Compare commits


37 Commits

SHA1 Message Date
eabf595ff5 systemd timer unit example has an 'OnCalendar' instruction 2022-03-26 23:12:42 +01:00
ab0a82c626 Hide log timestamps, intended use case is inside a systemd service unit anyway where systemd provides timestamps 2022-03-26 23:11:53 +01:00
03b449c768 systemd service unit will run on a timer, change unit type to oneshot and supply a timer unit file 2022-03-23 23:40:41 +01:00
e269a110a6 When cleaning file name remove question marks instead of replacing them with dashes 2022-03-23 23:39:32 +01:00
a3a375d142 Config and JSON files by default live in script's dir 2022-03-23 23:39:04 +01:00
81ce5812a6 Account for situations where a state file does not (yet) exist 2022-03-23 23:38:25 +01:00
83921912a4 Add to-dos 2022-03-23 23:37:19 +01:00
65e3ec83b1 Add example systemd service unit file 2022-03-23 15:53:31 +01:00
266d3189dc Replace our maus-query.json file with an example with sane defaults 2022-03-23 15:49:54 +01:00
563ff4d342 Replace our config.ini with a sane example 2022-03-23 15:47:33 +01:00
287a755e65 Streamline download selection 2022-03-20 02:35:10 +01:00
d70766bae0 If a partial download exists in temp dir we resume it 2022-03-20 02:34:15 +01:00
380fb4bf2e Calculate total downloadable content length after deciding which downloads we need 2022-03-20 02:33:32 +01:00
e395309011 Cosmetics, remove unnecessary lines, rewrite help texts a bit 2022-03-20 02:31:55 +01:00
029d9ffb7e When replacing filename pattern strings only log the ones we're seeing in config file 2022-03-20 02:30:23 +01:00
ec612de2dd Remove event handler from downloads 2022-03-20 02:29:22 +01:00
2905ff5c74 Fix encoding for JSON files 2022-03-20 02:28:14 +01:00
0cfe47465d Fix encoding for JSON files 2022-03-20 02:27:55 +01:00
5eff7876bc Cosmetics, lower urllib3.connectionpool log level back to WARNING 2022-03-20 02:25:39 +01:00
4809846edf Correctly calculate download speed if we're resuming 2022-03-20 02:25:02 +01:00
b5dff485d9 Move downloaded file into target location even across file system boundaries 2022-03-20 02:22:12 +01:00
e78659b2de Example JSON query uses 100 results 2022-03-20 02:17:36 +01:00
27004a5294 By default 'hörfassung' isn't needed for downloads 2022-03-20 02:16:52 +01:00
d0552f9e67 Check if file was previously downloaded 2022-03-19 16:58:27 +01:00
dd6464de5d Start implementing range downloading 2022-03-19 16:58:04 +01:00
facfe4e7d3 Check state file for previous downloads, improve logging per thread 2022-03-19 16:08:12 +01:00
c0a271d0eb Cosmetics, add trailing newline 2022-03-19 14:13:45 +01:00
11b9daa729 Update urllib3 to 1.26.9 2022-03-19 14:13:17 +01:00
9a5ce9469d Use filelock to make sure we don't try to write state info from multiple download threads simultaneously 2022-03-19 14:12:42 +01:00
3f6bc46d52 Cosmetics, remove unused variable 2022-03-19 08:51:31 +01:00
601583afc3 Maintain a state file 2022-03-19 08:50:51 +01:00
7081c6b50a Generate safe filename 2022-03-19 03:52:26 +01:00
7b391be89e Cosmetics, replace safe filename regex with triple-quoted string so we don't have to backslash-escape literal double quote 2022-03-19 01:53:39 +01:00
c5f74ce479 Separate function to expand download directory 2022-03-19 01:52:31 +01:00
4c327a35f5 Cosmetics, put imports at top of file 2022-03-19 01:51:44 +01:00
5e321686d0 Cosmetics, put imports at top of file 2022-03-19 01:51:11 +01:00
e02c0bf52e Add options to customize output filename 2022-03-19 01:50:05 +01:00
8 changed files with 409 additions and 100 deletions

View File

@@ -1,28 +0,0 @@
[DEFAULT]
self_name = mvw-dl
tmp_base_dir = /tmp/%(self_name)s
state_base_dir = /var/lib/%(self_name)s
state_files_dir = %(state_base_dir)s/state
state_file_retention = 50
state_file_name_prefix = state-
state_file_name_suffix = .log
mvw_endpoint = http://localhost:8000/api/query
title_dedup_winner = first
dl_progress_update_interval = 10
dl_threads = 2
[maus]
min_duration = 1200
max_duration = 2700
query = @maus-query.json
title_not_regex = audiodeskription|gebärdensprache
# query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
# state_file_name = maus
# tmp_base_dir = %(tmp_base_dir)s/maus
dl_dir = ~/maus
#[test]
#min_duration = 100
#max_duration = 200
#query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
#dl_dir = test

View File

@@ -0,0 +1,25 @@
[DEFAULT]
self_name = mvw-dl
tmp_base_dir = /tmp/%(self_name)s
state_base_dir = /var/lib/%(self_name)s
state_files_dir = %(state_base_dir)s/state
state_file_retention = 50
state_file_name_prefix = state-
state_file_name_suffix = .log
mvw_endpoint = http://localhost:8000/api/query
title_dedup_winner = first
dl_progress_update_interval = 10
dl_threads = 2
dl_filename_pattern = &(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s
publish_date_srtftime_pattern = %%Y%%m%%d
dl_filename_replace_spaces_with =
dl_filename_all_lowercase = no
[maus]
min_duration = 1200
max_duration = 3000
query = @maus-query.json.example
title_not_regex = audiodeskription|gebärdensprache|hörfassung
dl_filename_pattern = &(publish_date)s.&(ext)s
publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
dl_dir = /tmp/kodi-nfo-feeder/maus
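
Note: the &(...)s placeholders in dl_filename_pattern deliberately look different from configparser's own %(...)s interpolation syntax (which this file already uses for tmp_base_dir and friends), presumably so the two don't clash. A minimal sketch of how such a pattern expands, using made-up show values; the real logic is filename_replace_pattern() in mvw-dl.py further down:

    import re

    pattern = "&(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s"
    # Made-up example values; real values come from the MediathekViewWeb API response
    show = {"channel": "ARD", "publish_date": "20220326",
            "topic": "Die Sendung mit der Maus", "title": "Beispielfolge", "ext": "mp4"}

    filename = pattern
    for attr, value in show.items():
        # Each &(attr)s token is swapped for the corresponding show value
        filename = re.sub(r"&\(" + re.escape(attr) + r"\)s", value, filename)

    print(filename)  # ARD - 20220326 - Die Sendung mit der Maus - Beispielfolge.mp4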

View File

@@ -0,0 +1,12 @@
[Unit]
Description=MediathekViewWeb download helper
After=multi-user.target
[Service]
Type=oneshot
RemainAfterExit=no
Environment='PATH=/usr/local/sbin:/usr/local/bin:/usr/bin'
ExecStart=/opt/miniconda3/envs/mvw-dl/bin/python /opt/python/mvw-dl/dev/mvw-dl.py
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,9 @@
[Unit]
Description=Run MediathekViewWeb download helper
[Timer]
OnCalendar=0/2:2
Persistent=true
[Install]
WantedBy=timers.target

426
mvw-dl.py
View File

@@ -3,9 +3,13 @@ import datetime as d
 import json
 import logging
 import os
+import pathlib
 import re
+import shutil
 import sys
 import time
+
+import filelock
 import humanize
 import requests
 import inflect
@@ -13,6 +17,8 @@ from rich.logging import RichHandler
 from rich.traceback import install
 import typing as t
 from rich.console import Console
+from filelock import Timeout, FileLock
+import uuid
 
 import type_def.mvw_json_response
 from type_def.mvw_json_request import MVWJSONRequest
@@ -28,10 +34,6 @@ from threading import Event
 from typing import Iterable
 from urllib.request import urlopen
 
-# TODO set locale for datetime and others to globally stick to en_US
-download_start_time = 0
-download_last_update_time = 0
-size_downloaded = 0
-
 from rich.progress import (
     BarColumn,
@@ -43,6 +45,20 @@ from rich.progress import (
     TransferSpeedColumn,
 )
 
+# TODO set locale for datetime and others to globally stick to en_US
+# TODO thread log messages display timestamp in systemd journal
+# TODO Increment file name suffix more than once of needed
+# TODO [23:15:14] DEBUG [thread]
+# TODO Clean mvw-dl.timer
+# TODO Reset maus-query.json
+
+download_start_time = 0
+download_last_update_time = 0
+total_content_length = 0
+size_downloaded_for_progress_tracking = 0
+size_downloaded_for_speed_tracking = 0
+file_lock_timeout = 1
+state_lock_file_ext = ".lock"
+
 progress = Progress(
     TextColumn("[bold blue]{task.fields[filename]}", justify="right"),
     BarColumn(bar_width=None),
@@ -71,6 +87,9 @@ JSONType = t.Union[str, int, float, bool, None, t.Dict[str, t.Any], t.List[t.Any]]
 # 1: Config file invalid, it has no sections
 # 2: Config file invalid, sections must define at least CONST.CFG_MANDATORY
 # 3: No search results to download
+# 4: State file already exists, has more than 0 bytes size but doesn't contain usable JSON
+# 5: State file lock cannot be acquired within file_lock_timeout
+# 6: Unable to create state directory
 
 
 class CONST(object):
@@ -90,7 +109,11 @@ class CONST(object):
{"key": "mvw_endpoint", "value": "http://localhost:8000/api/query"}, {"key": "mvw_endpoint", "value": "http://localhost:8000/api/query"},
{"key": "title_dedup_winner", "value": "first"}, {"key": "title_dedup_winner", "value": "first"},
{"key": "dl_progress_update_interval", "value": "10"}, {"key": "dl_progress_update_interval", "value": "10"},
{"key": "dl_threads", "value": "2"} {"key": "dl_threads", "value": "2"},
{"key": "dl_filename_pattern", "value": "&(channel)s - &(publish_date)s - &(topic)s - &(title)s"},
{"key": "publish_date_srtftime_pattern", "value": "%%Y%%m%%d"},
{"key": "dl_filename_replace_spaces_with", "value": "_"},
{"key": "dl_filename_all_lowercase", "value": "yes"}
] ]
CFG_KNOWN_SECTION = [ CFG_KNOWN_SECTION = [
{"key": "min_duration", "is_mandatory": False}, {"key": "min_duration", "is_mandatory": False},
@@ -109,16 +132,16 @@ logging.basicConfig(
     format=CONST.LOG_FORMAT,
     datefmt="[%X]",
     handlers=[RichHandler(
-        show_time=False if "SYSTEMD_EXEC_PID" in os.environ else True,
+        show_time=False,
         rich_tracebacks=True
     )]
 )
 log = logging.getLogger("rich")
 # Our own code logs with this level
 log.setLevel(logging.DEBUG)
-# connectionpool logs with WARNING, we don't need its verbosity
-log_connectionpool = logging.getLogger("urllib3.connectionpool")
-log_connectionpool.setLevel(logging.WARNING)
+# connectionpool and filelock log with WARNING, we don't need its verbosity
+logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
+logging.getLogger("filelock").setLevel(logging.WARNING)
 
 install(show_locals=True)
@@ -142,7 +165,7 @@ class ConfigParser(
 ini_defaults = []
 internal_defaults = {default["key"]: default["value"] for default in CONST.CFG_KNOWN_DEFAULTS}
 config = ConfigParser(defaults=internal_defaults)
-config.read(CONST.CFG_DEFAULT_FILENAME)
+config.read(CONST.CFG_DEFAULT_ABS_PATH)
 
 
 def print_section_header(
@@ -210,7 +233,8 @@ def validate_config_sections(
 def query_string_from_file(
         filename: str) -> str:
-    with open(filename, "r") as jsonfile:
+    filename_abs_path = os.path.join(CONST.CFG_THIS_FILE_DIRNAME, filename)
+    with open(filename_abs_path, "r", encoding="utf-8") as jsonfile:
         query_string = jsonfile.read()
     return query_string
@@ -335,27 +359,209 @@ def dedup_json_titles(
     return json_obj
 
 
-done_event = Event()
-
-
-def handle_sigint(signum, frame):
-    done_event.set()
-
-
-signal.signal(signal.SIGINT, handle_sigint)
+def expanded_dest_dir(
+        raw_dest_dir: str) -> str:
+    user_expanded_dest_dir = os.path.expanduser(raw_dest_dir)
+    all_expanded_dest_dir = os.path.expandvars(user_expanded_dest_dir)
+    return all_expanded_dest_dir
+
+
+def filename_replace_pattern(
+        section_name: str,
+        config_obj: configparser.ConfigParser(),
+        show: type_def.mvw_json_response.Show,
+        max_quality_url: str,
+        shorthand_uuid: str) -> str:
+    filename = config_obj.get(section_name, "dl_filename_pattern")
+    ext = pathlib.Path(max_quality_url).suffix.lstrip(".")
+    publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime(
+        config_obj.get(section_name, "publish_date_srtftime_pattern"))
+    show_extended = {"ext": ext, "publish_date": publish_date}
+    show_attrs = [attr for attr in dir(show) if not attr.startswith('_') and not callable(getattr(show, attr))]
+
+    for attr in show_attrs:
+        attr_re = re.compile(r"&\(" + re.escape(attr) + r"\)s")
+        if re.search(attr_re, filename):
+            log.debug(f"{shorthand_uuid} Replacing filename pattern '&({attr})s' ...")
+            filename = re.sub(attr_re, str(getattr(show, attr)), filename)
+            log.debug(f"{shorthand_uuid} New filename: '{filename}'")
+    for extended_attr in show_extended:
+        extended_attr_re = re.compile(r"&\(" + re.escape(extended_attr) + r"\)s")
+        if re.search(extended_attr_re, filename):
+            log.debug(f"{shorthand_uuid} Replacing filename pattern '&({extended_attr})s' ...")
+            filename = re.sub(extended_attr_re, show_extended[extended_attr], filename)
+            log.debug(f"{shorthand_uuid} New filename: '{filename}'")
+    return filename
 
 
 def get_safe_filename(
-        dirty_filename: str) -> str:
+        dirty_filename: str,
+        shorthand_uuid: str) -> str:
     """https://stackoverflow.com/a/71199182"""
-    clean_filename = re.sub(r"[/\\?%*:|\"<>\x7F\x00-\x1F]", "-", dirty_filename)
+    log.debug(f"{shorthand_uuid} Removing question marks from file name ...")
+    clean_filename = re.sub(r"""[?]""", "", dirty_filename)
+
+    log.debug(f"{shorthand_uuid} Replacing unsafe characters in filename with dashes ...")
+    clean_filename = re.sub(r"""[/\\?%*:|"<>\x7F\x00-\x1F]""", "-", clean_filename)
+    log.debug(f"{shorthand_uuid} New filename: '{clean_filename}'")
     return clean_filename
 
 
+def filename_replace_spaces_with_underscores(
+        section_name: str,
+        config_obj: configparser.ConfigParser(),
+        filename: str,
+        shorthand_uuid: str) -> str:
+    space_replace_string = config_obj.get(section_name, "dl_filename_replace_spaces_with")
+    log.debug(f"{shorthand_uuid} Replacing space characters with '{space_replace_string}' ...")
+    underscored_filename = re.sub(
+        r"\s",
+        space_replace_string,
+        filename)
+    log.debug(f"{shorthand_uuid} New filename: '{underscored_filename}'")
+    return underscored_filename
+
+
+def get_filename(
+        section_name: str,
+        config_obj: configparser.ConfigParser(),
+        show: type_def.mvw_json_response.Show,
+        max_quality_url: str,
+        shorthand_uuid: str) -> str:
+    log.debug(f"{shorthand_uuid} Generating final filename ...")
+    filename_replaced_patterns = filename_replace_pattern(
+        section_name,
+        config_obj,
+        show,
+        max_quality_url,
+        shorthand_uuid)
+
+    filename_safe = get_safe_filename(
+        filename_replaced_patterns,
+        shorthand_uuid)
+
+    if config.get(section_name, "dl_filename_replace_spaces_with"):
+        filename_safe = filename_replace_spaces_with_underscores(
+            section_name,
+            config_obj,
+            filename_safe,
+            shorthand_uuid)
+
+    if config.getboolean(section_name, "dl_filename_all_lowercase"):
+        log.debug(f"{shorthand_uuid} Lowercasing all filename characters ...")
+        filename_safe = filename_safe.lower()
+        log.debug(f"{shorthand_uuid} New filename: '{filename_safe}'")
+    return filename_safe
+
+
+def get_state_file_abs_path(
+        section_name: str,
+        config_obj: configparser.ConfigParser()) -> str:
+    state_dir = config_obj.get(section_name, "state_files_dir")
+    try:
+        os.makedirs(state_dir, exist_ok=True)
+    except OSError:
+        log.error(f"Unable to create '[{section}]' state directory '{state_dir}'. "
+                  f"We're not going to be able to log state information. Exiting 6 ...")
+        sys.exit(6)
+    else:
+        state_file = \
+            config_obj.get(section_name, "state_file_name_prefix") + \
+            section_name + \
+            config_obj.get(section_name, "state_file_name_suffix")
+        state_file_abs_path = os.path.join(state_dir, state_file)
+        return state_file_abs_path
+
+
+def state_file_none_or_valid_json(
+        state_file_abs_path: str) -> bool:
+    if os.path.exists(state_file_abs_path):
+        if os.path.getsize(state_file_abs_path) > 0:
+            with open(state_file_abs_path, "r", encoding="utf-8") as state_file:
+                try:
+                    json.loads(state_file.read())
+                    return True
+                except json.JSONDecodeError:
+                    log.warning(f"State file '{state_file_abs_path}' does not contain valid JSON. We're not going to "
+                                f"be able to log anything into it. Exiting 4 ...")
+                    sys.exit(4)
+        else:
+            return True
+    else:
+        return True
+
+
+def truncate_log(
+        json_data: json.loads,
+        max_log_entries: int) -> json.loads:
+    for i in range(len(json_data)):
+        del json_data[i]
+        if len(json_data) <= max_log_entries:
+            break
+    return json_data
+
+
+def get_state_file_lock(
+        state_lock_file: str) -> filelock.BaseFileLock:
+    global file_lock_timeout
+    try:
+        lock = FileLock(state_lock_file, timeout=file_lock_timeout)
+        return lock
+    except filelock.Timeout:
+        log.error(f"Unable to acquire lock on state lock file '{state_lock_file}' "
+                  f"""within {file_lock_timeout} {p.plural("second", file_lock_timeout)}, exiting 5 ...""")
+        sys.exit(5)
+
+
 def log_successful_download(
-        show: type_def.mvw_json_response.Show) -> None:
-    pass
+        section_name: str,
+        config_obj: configparser.ConfigParser(),
+        show: type_def.mvw_json_response.Show,
+        state_file_abs_path: str,
+        job_uuid: str,
+        shorthand_uuid: str) -> None:
+    timestamp_now = int(time.time())
+    os.makedirs(os.path.dirname(state_file_abs_path), exist_ok=True)
+    state_lock_file = state_file_abs_path + state_lock_file_ext
+
+    state_body = show.dict(include={"topic", "title"})
+    state_body["dl_complete_timestamp_epoch"] = timestamp_now
+    state_body["dl_complete_timestamp_human"] = \
+        d.datetime.utcfromtimestamp(timestamp_now).strftime("%Y-%m-%d %H%M%S UTC")
+    state_entry = {job_uuid: state_body}
+    json_state = None
+
+    lock = get_state_file_lock(state_lock_file)
+    with lock:
+        state_file_none_or_valid_json(state_file_abs_path)
+        state_file_open_mode = "r+" if os.path.exists(state_file_abs_path) else "w+"
+        with open(state_file_abs_path, state_file_open_mode, encoding="utf-8") as state_file:
+            try:
+                json_state = json.load(state_file)
+            except json.JSONDecodeError:
+                if json_state is None:
+                    state_file.truncate()
+                    json_state = []
+        log.debug(f"{shorthand_uuid} Writing log entry to '{state_file_abs_path}' ...")
+        with open(state_file_abs_path, "w", encoding="utf-8") as state_file:
+            json_state.append(state_entry)
+            max_log_entries = config_obj.getint(section_name, "state_file_retention")
+            if len(json_state) > max_log_entries:
+                json_state = truncate_log(json_state, max_log_entries)
+            json.dump(json_state, state_file, indent=4, sort_keys=True, ensure_ascii=False)
 
 
 def copy_url(
@@ -363,46 +569,77 @@ def copy_url(
         config_obj: configparser.ConfigParser(),
         show: type_def.mvw_json_response.Show,
         video_metadata: dict,
-        total_content_length: int) -> None:
+        state_file_abs_path: str,
+        show_name: str,
+        job_uuid: str,
+        shorthand_uuid: str,
+        tmp_dir: str,
+        dest_dir: str) -> None:
     """Copy data from a url to a local file."""
 
     global download_start_time
     global download_last_update_time
-    global size_downloaded
+    global size_downloaded_for_progress_tracking
+    global size_downloaded_for_speed_tracking
 
     update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
     max_quality_url = video_metadata["url"]
-    filename = max_quality_url.split("/")[-1]
-    dest_dir = config_obj.get(section_name, "dl_dir")
+    filename = get_filename(section_name, config_obj, show, max_quality_url, shorthand_uuid)
+    resume_header = {}
+    tmp_file_open_mode = "wb"
+    tmp_file_size = 0
+    tmp_path = os.path.join(tmp_dir, filename)
     dest_path = os.path.join(dest_dir, filename)
-    dest_path = os.path.expanduser(dest_path)
-    dest_path = os.path.expandvars(dest_path)
-    show_name = f"{show.topic} - {show.title}"
-    publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime('%Y%m%d')
 
-    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
-    with open(dest_path, "wb") as dest_file:
-        log.info(f"""Downloading "{show_name}" ...""")
-        log.info(f"Download location resolved to {dest_path}")
-        r = requests.get(max_quality_url, stream=True)
-        for chunk in r.iter_content(32768):
-            size_downloaded += len(chunk)
-            dest_file.write(chunk)
-            if time.time() - download_last_update_time >= update_interval:
-                download_last_update_time = time.time()
-                dl_speed_so_far = size_downloaded / (download_last_update_time - download_start_time)
-                human_dl_speed_so_far = f"{humanize.naturalsize(dl_speed_so_far, binary=True)}/s"
-                percentage_done = size_downloaded / total_content_length * 100
-                human_pct = "{:.1f}".format(percentage_done)
-                human_size_dl = humanize.naturalsize(size_downloaded, binary=True)
-                human_total_dl = humanize.naturalsize(total_content_length, binary=True)
-                log.debug(f"Downloaded {human_pct}% ({human_size_dl}/{human_total_dl} at an average "
-                          f"{human_dl_speed_so_far})")
-            if done_event.is_set():
-                log.info(f"""Download of "{show_name}" interrupted""")
-                return
-    log.info(f"""Download of "{show_name}" done""")
-    log_successful_download(show)
+    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
+    log.info(f"{shorthand_uuid} Download location resolved to '{tmp_path}'")
+    if os.path.exists(tmp_path):
+        tmp_file_size = os.path.getsize(tmp_path)
+        log.debug(f"{shorthand_uuid} Temporary file '{tmp_path}' exists likely from a previous incomplete "
+                  f"download attempt, size is {humanize.naturalsize(tmp_file_size, binary=True)}. Resuming ...")
+        tmp_file_open_mode = "ab"
+    try:
+        with open(tmp_path, tmp_file_open_mode) as tmp_file:
+            log.info(f"""{shorthand_uuid} Downloading "{show_name}" ...""")
+            if tmp_file_size > 0:
+                resume_header = {"range": f"bytes={tmp_file_size}-"}
+                log.debug(f"resume_header: {resume_header}")
+                size_downloaded_for_progress_tracking += tmp_file_size
+            r = requests.get(max_quality_url, headers=resume_header, stream=True)
+            for chunk in r.iter_content(32768):
+                size_downloaded_for_progress_tracking += len(chunk)
+                size_downloaded_for_speed_tracking += len(chunk)
+                tmp_file.write(chunk)
+                if time.time() - download_last_update_time >= update_interval:
+                    download_last_update_time = time.time()
+                    time_in_progress = download_last_update_time - download_start_time
+                    dl_speed_so_far = size_downloaded_for_speed_tracking / time_in_progress
+                    human_dl_speed_so_far = f"{humanize.naturalsize(dl_speed_so_far, binary=True)}/s"
+                    data_missing = total_content_length - size_downloaded_for_progress_tracking
+                    time_til_completion = 1 / dl_speed_so_far * data_missing
+                    human_time_til_completion = humanize.naturaldelta(d.timedelta(seconds=time_til_completion))
+                    percentage_done = size_downloaded_for_progress_tracking / total_content_length * 100
+                    human_pct = "{:.1f}".format(percentage_done)
+                    human_size_dl = humanize.naturalsize(size_downloaded_for_progress_tracking, binary=True)
+                    human_total_dl = humanize.naturalsize(total_content_length, binary=True)
+                    log.debug(f"[thread] Downloaded {human_pct}% ({human_size_dl}/{human_total_dl} "
+                              f"at an average {human_dl_speed_so_far}, approximately {human_time_til_completion} "
+                              f"left til completion.)")
+            log.info(f"""{shorthand_uuid} Download of "{show_name}" done""")
+    except IOError:
+        log.error(f"{shorthand_uuid} IOError during download. Aborting this download thread ...")
+        return
+
+    log.info(f"{shorthand_uuid} Moving file to final location '{dest_path}' ...")
+    try:
+        shutil.move(tmp_path, dest_path)
+    except OSError as ose:
+        log.error(f"{shorthand_uuid} Failed moving file with an OSError\n"
+                  f"{ose}\n"
+                  f"Other threads continue unhindered.")
+    else:
+        log_successful_download(section_name, config_obj, show, state_file_abs_path, job_uuid, shorthand_uuid)
+        log.info(f"{shorthand_uuid} Done moving")
 
 
 def get_max_quality_url(
@@ -425,6 +662,33 @@ def get_content_length(
         return 0
 
 
+def get_json_state(
+        state_file_abs_path: str) -> json.loads:
+    try:
+        with open(state_file_abs_path, "r", encoding="utf-8") as state_file:
+            try:
+                json_state = json.load(state_file)
+            except json.JSONDecodeError:
+                return []
+            else:
+                return json_state
+    except FileNotFoundError:
+        log.debug(f"State file does not exist (yet), assuming no previous downloads have ever happened ...")
+        return []
+
+
+def is_already_downloaded(
+        show: type_def.mvw_json_response.Show,
+        json_state: json.loads,
+        show_name: str) -> bool:
+    for log_entry in json_state:
+        for log_data in [key for key in log_entry]:
+            if show.topic == log_entry[log_data]["topic"] and show.title == log_entry[log_data]["title"]:
+                log.debug(f"""Show "{show_name}" already downloaded, won't queue""")
+                return True
+
+
 def download_media(
         section_name: str,
         config_obj: configparser.ConfigParser(),
@@ -432,33 +696,59 @@ def download_media(
     global download_start_time
     global download_last_update_time
+    global total_content_length
 
     dl_threads = config_obj.getint(section_name, "dl_threads")
+    state_file_abs_path = get_state_file_abs_path(section_name, config_obj)
+    state_lock_file = state_file_abs_path + state_lock_file_ext
     video_metadata = {}
-    for result in json_obj.result.results.copy():
-        max_quality_url = get_max_quality_url(result)
-        content_length = get_content_length(max_quality_url)
-        video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
-    total_content_length = 0
-    for video in video_metadata:
-        total_content_length += video_metadata[video]["content_length"]
-    video_metadata["total_content_length"] = total_content_length
-
-    log.info(f"""Download location is {config_obj.get(section_name, "dl_dir")}""")
+    tmp_dir = expanded_dest_dir(config_obj.get(section_name, "tmp_base_dir"))
+    dest_dir = expanded_dest_dir(config_obj.get(section_name, "dl_dir"))
+
+    log.info(f"""Download location is {tmp_dir}""")
+    log.info(f"""Final location is {dest_dir}""")
     log.info(f"Limiting parallel downloads to {dl_threads} ...")
+
+    lock = get_state_file_lock(state_lock_file)
+    with lock:
+        state_file_none_or_valid_json(state_file_abs_path)
+        json_state = get_json_state(state_file_abs_path)
+
     with ThreadPoolExecutor(max_workers=dl_threads) as pool:
         download_last_update_time = time.time()
         download_start_time = download_last_update_time
         update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
         log.debug(f"""Will provide updates every {update_interval} {p.plural("second", update_interval)}""")
         for result in json_obj.result.results.copy():
-            pool.submit(
-                copy_url,
-                section_name,
-                config_obj,
-                result,
-                video_metadata[result.id],
-                video_metadata["total_content_length"])
+            show_name = f"{result.topic} - {result.title}"
+            future = None
+            if not is_already_downloaded(result, json_state, show_name):
+                max_quality_url = get_max_quality_url(result)
+                content_length = get_content_length(max_quality_url)
+                video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
+                total_content_length += video_metadata[result.id]["content_length"]
+                log.debug(f"Total download size upped to "
+                          f"{humanize.naturalsize(total_content_length, binary=True)}")
+
+                job_uuid = str(uuid.uuid4())
+                shorthand_uuid = f"[{job_uuid[:2]}..{job_uuid[-2:]}]"
+                log.debug(f"{shorthand_uuid} Job UUID {job_uuid} generated, shorthand is {shorthand_uuid}")
+                log.debug(f"""{shorthand_uuid} Queuing "{show_name}" for download ...""")
+
+                future = pool.submit(
+                    copy_url,
+                    section_name,
+                    config_obj,
+                    result,
+                    video_metadata[result.id],
+                    state_file_abs_path,
+                    show_name,
+                    job_uuid,
+                    shorthand_uuid,
+                    tmp_dir,
+                    dest_dir)
+            if future is not None:
+                future.result()
 
 
 if __name__ == '__main__':
@@ -488,8 +778,6 @@ if __name__ == '__main__':
         if config.has_option(section, "title_not_regex"):
             json_response = dedup_json_titles(section, config, json_response)
 
-        log.debug(f"Downloading {json_response.result.queryInfo.resultCount} "
+        log.debug(f"Interested in {json_response.result.queryInfo.resultCount} "
                   f"""{p.plural("show", json_response.result.queryInfo.resultCount)} ...""")
         download_media(section, config, json_response)
-        # console.print_json(json_response.json())
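
For reference, the state file that log_successful_download() maintains ends up holding a JSON list with one entry per completed download, keyed by the job UUID. Roughly, with purely illustrative values:

    [
        {
            "6f1c2a0e-0d3b-4a6e-9c1f-2b7d8e5a4c3d": {
                "dl_complete_timestamp_epoch": 1648057514,
                "dl_complete_timestamp_human": "2022-03-23 174514 UTC",
                "title": "Beispielfolge",
                "topic": "Die Sendung mit der Maus"
            }
        }
    ]

On the next run is_already_downloaded() walks this list and skips any show whose topic and title are already recorded, while truncate_log() trims the list down to state_file_retention entries.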

View File

@@ -3,3 +3,4 @@ requests
 inflect
 pydantic
 humanize
+filelock

View File

@@ -10,6 +10,8 @@ charset-normalizer==2.0.12
     # via requests
 commonmark==0.9.1
     # via rich
+filelock==3.6.0
+    # via -r requirements.in
 humanize==4.0.0
     # via -r requirements.in
 idna==3.3
@@ -26,5 +28,5 @@ rich==12.0.0
     # via -r requirements.in
 typing-extensions==4.1.1
     # via pydantic
-urllib3==1.26.8
+urllib3==1.26.9
     # via requests