Compare commits

..

23 Commits

Author SHA1 Message Date
eabf595ff5 systemd timer unit example has one 'OnCalendar' instruction 2022-03-26 23:12:42 +01:00
ab0a82c626 Hide log timestamps, intended use case is inside a systemd service unit anyway where systemd provides timestamps 2022-03-26 23:11:53 +01:00
03b449c768 systemd service unit will run on a timer, change unit type to oneshot and supply a timer unit file 2022-03-23 23:40:41 +01:00
e269a110a6 When cleaning file name remove question marks instead of replacing them with dashes 2022-03-23 23:39:32 +01:00
a3a375d142 Config and JSON files by default live in script's dir 2022-03-23 23:39:04 +01:00
81ce5812a6 Account for situations where a state file does not (yet) exist 2022-03-23 23:38:25 +01:00
83921912a4 Add to-dos 2022-03-23 23:37:19 +01:00
65e3ec83b1 Add example systemd service unit file 2022-03-23 15:53:31 +01:00
266d3189dc Replace our maus-query.json file with an example with sane defaults 2022-03-23 15:49:54 +01:00
563ff4d342 Replace our config.ini with a sane example 2022-03-23 15:47:33 +01:00
287a755e65 Streamline download selection 2022-03-20 02:35:10 +01:00
d70766bae0 If a partial download exists in temp dir we resume it 2022-03-20 02:34:15 +01:00
380fb4bf2e Calculate total downloadable content length after deciding which downloads we need 2022-03-20 02:33:32 +01:00
e395309011 Cosmetics, remove unnecessary lines, rewrite help texts a bit 2022-03-20 02:31:55 +01:00
029d9ffb7e When replacing filename pattern strings only log the ones we're seeing in config file 2022-03-20 02:30:23 +01:00
ec612de2dd Remove event handler from downloads 2022-03-20 02:29:22 +01:00
2905ff5c74 Fix encoding for JSON files 2022-03-20 02:28:14 +01:00
0cfe47465d Fix encoding for JSON files 2022-03-20 02:27:55 +01:00
5eff7876bc Cosmetics, lower urllib3.connectionpool log level back to WARNING 2022-03-20 02:25:39 +01:00
4809846edf Correctly calculate download speed if we're resuming 2022-03-20 02:25:02 +01:00
b5dff485d9 Move downloaded file into target location even across file system boundaries 2022-03-20 02:22:12 +01:00
e78659b2de Example JSON query uses 100 results 2022-03-20 02:17:36 +01:00
27004a5294 By default 'hörfassung' isn't needed for downloads 2022-03-20 02:16:52 +01:00
6 changed files with 175 additions and 128 deletions

View File

@@ -1,34 +0,0 @@
[DEFAULT]
self_name = mvw-dl
tmp_base_dir = /tmp/%(self_name)s
state_base_dir = /var/lib/%(self_name)s
state_files_dir = %(state_base_dir)s/state
state_file_retention = 50
state_file_name_prefix = state-
state_file_name_suffix = .log
mvw_endpoint = http://localhost:8000/api/query
title_dedup_winner = first
dl_progress_update_interval = 10
dl_threads = 2
dl_filename_pattern = &(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s
publish_date_srtftime_pattern = %%Y%%m%%d
dl_filename_replace_spaces_with =
dl_filename_all_lowercase = no
[maus]
min_duration = 1200
max_duration = 2700
query = @maus-query.json
title_not_regex = audiodeskription|gebärdensprache
# dl_filename_pattern = &(publish_date)s.&(ext)s
# publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
# query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
# state_file_name = maus
# tmp_base_dir = %(tmp_base_dir)s/maus
dl_dir = ~/maus
#[test]
#min_duration = 100
#max_duration = 200
#query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
#dl_dir = test

View File

@@ -0,0 +1,25 @@
[DEFAULT]
self_name = mvw-dl
tmp_base_dir = /tmp/%(self_name)s
state_base_dir = /var/lib/%(self_name)s
state_files_dir = %(state_base_dir)s/state
state_file_retention = 50
state_file_name_prefix = state-
state_file_name_suffix = .log
mvw_endpoint = http://localhost:8000/api/query
title_dedup_winner = first
dl_progress_update_interval = 10
dl_threads = 2
dl_filename_pattern = &(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s
publish_date_srtftime_pattern = %%Y%%m%%d
dl_filename_replace_spaces_with =
dl_filename_all_lowercase = no
[maus]
min_duration = 1200
max_duration = 3000
query = @maus-query.json.example
title_not_regex = audiodeskription|gebärdensprache|hörfassung
dl_filename_pattern = &(publish_date)s.&(ext)s
publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
dl_dir = /tmp/kodi-nfo-feeder/maus

View File

@@ -17,5 +17,5 @@
"sortOrder": "desc",
"future": false,
"offset": 0,
"size": 15
"size": 20
}

View File

@@ -0,0 +1,12 @@
[Unit]
Description=MediathekViewWeb download helper
After=multi-user.target
[Service]
Type=oneshot
RemainAfterExit=no
Environment='PATH=/usr/local/sbin:/usr/local/bin:/usr/bin'
ExecStart=/opt/miniconda3/envs/mvw-dl/bin/python /opt/python/mvw-dl/dev/mvw-dl.py
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,9 @@
[Unit]
Description=Run MediathekViewWeb download helper
[Timer]
OnCalendar=0/2:2
Persistent=true
[Install]
WantedBy=timers.target

173
mvw-dl.py
View File

@@ -5,6 +5,7 @@ import logging
import os
import pathlib
import re
import shutil
import sys
import time
@@ -45,9 +46,16 @@ from rich.progress import (
)
# TODO set locale for datetime and others to globally stick to en_US
# TODO thread log messages display timestamp in systemd journal
# TODO Increment file name suffix more than once if needed
# TODO [23:15:14] DEBUG [thread]
# TODO Clean mvw-dl.timer
# TODO Reset maus-query.json
download_start_time = 0
download_last_update_time = 0
size_downloaded = 0
total_content_length = 0
size_downloaded_for_progress_tracking = 0
size_downloaded_for_speed_tracking = 0
file_lock_timeout = 1
state_lock_file_ext = ".lock"
@@ -81,6 +89,7 @@ JSONType = t.Union[str, int, float, bool, None, t.Dict[str, t.Any], t.List[t.Any
# 3: No search results to download
# 4: State file already exists, has more than 0 bytes size but doesn't contain usable JSON
# 5: State file lock cannot be acquired within file_lock_timeout
# 6: Unable to create state directory
class CONST(object):
@@ -123,7 +132,7 @@ logging.basicConfig(
format=CONST.LOG_FORMAT,
datefmt="[%X]",
handlers=[RichHandler(
show_time=False if "SYSTEMD_EXEC_PID" in os.environ else True,
show_time=False,
rich_tracebacks=True
)]
)
@@ -131,7 +140,7 @@ log = logging.getLogger("rich")
# Our own code logs with this level
log.setLevel(logging.DEBUG)
# connectionpool and filelock log at WARNING, we don't need their verbosity
# logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
logging.getLogger("filelock").setLevel(logging.WARNING)
install(show_locals=True)
@@ -156,7 +165,7 @@ class ConfigParser(
ini_defaults = []
internal_defaults = {default["key"]: default["value"] for default in CONST.CFG_KNOWN_DEFAULTS}
config = ConfigParser(defaults=internal_defaults)
config.read(CONST.CFG_DEFAULT_FILENAME)
config.read(CONST.CFG_DEFAULT_ABS_PATH)
def print_section_header(
@@ -224,7 +233,8 @@ def validate_config_sections(
def query_string_from_file(
filename: str) -> str:
with open(filename, "r") as jsonfile:
filename_abs_path = os.path.join(CONST.CFG_THIS_FILE_DIRNAME, filename)
with open(filename_abs_path, "r", encoding="utf-8") as jsonfile:
query_string = jsonfile.read()
return query_string
@@ -349,16 +359,6 @@ def dedup_json_titles(
return json_obj
done_event = Event()
def handle_sigint(signum, frame):
done_event.set()
signal.signal(signal.SIGINT, handle_sigint)
def expanded_dest_dir(
raw_dest_dir: str) -> str:
user_expanded_dest_dir = os.path.expanduser(raw_dest_dir)
@@ -381,13 +381,17 @@ def filename_replace_pattern(
show_attrs = [attr for attr in dir(show) if not attr.startswith('_') and not callable(getattr(show, attr))]
for attr in show_attrs:
# log.debug(f"{shorthand_uuid} Replacing filename pattern '&({attr})s' ...")
filename = re.sub(r"&\(" + re.escape(attr) + r"\)s", str(getattr(show, attr)), filename)
# log.debug(f"{shorthand_uuid} New filename: '{filename}'")
attr_re = re.compile(r"&\(" + re.escape(attr) + r"\)s")
if re.search(attr_re, filename):
log.debug(f"{shorthand_uuid} Replacing filename pattern '&({attr})s' ...")
filename = re.sub(attr_re, str(getattr(show, attr)), filename)
log.debug(f"{shorthand_uuid} New filename: '{filename}'")
for extended_attr in show_extended:
# log.debug(f"{shorthand_uuid} Replacing filename pattern '&({extended_attr})s' ...")
filename = re.sub(r"&\(" + re.escape(extended_attr) + r"\)s", show_extended[extended_attr], filename)
# log.debug(f"{shorthand_uuid} New filename: '{filename}'")
extended_attr_re = re.compile(r"&\(" + re.escape(extended_attr) + r"\)s")
if re.search(extended_attr_re, filename):
log.debug(f"{shorthand_uuid} Replacing filename pattern '&({extended_attr})s' ...")
filename = re.sub(extended_attr_re, show_extended[extended_attr], filename)
log.debug(f"{shorthand_uuid} New filename: '{filename}'")
return filename
@@ -396,8 +400,12 @@ def get_safe_filename(
shorthand_uuid: str) -> str:
"""https://stackoverflow.com/a/71199182"""
log.debug(f"{shorthand_uuid} Removing question marks from file name ...")
clean_filename = re.sub(r"""[?]""", "", dirty_filename)
log.debug(f"{shorthand_uuid} Replacing unsafe characters in filename with dashes ...")
clean_filename = re.sub(r"""[/\\?%*:|"<>\x7F\x00-\x1F]""", "-", dirty_filename)
clean_filename = re.sub(r"""[/\\?%*:|"<>\x7F\x00-\x1F]""", "-", clean_filename)
log.debug(f"{shorthand_uuid} New filename: '{clean_filename}'")
return clean_filename
@@ -450,7 +458,6 @@ def get_filename(
filename_safe = filename_safe.lower()
log.debug(f"{shorthand_uuid} New filename: '{filename_safe}'")
log.debug(f"{shorthand_uuid} {filename_safe}")
return filename_safe
@@ -459,6 +466,13 @@ def get_state_file_abs_path(
config_obj: configparser.ConfigParser()) -> str:
state_dir = config_obj.get(section_name, "state_files_dir")
try:
os.makedirs(state_dir, exist_ok=True)
except OSError:
log.error(f"Unable to create '[{section}]' state directory '{state_dir}'. "
f"We're not going to be able to log state information. Exiting 6 ...")
sys.exit(6)
else:
state_file = \
config_obj.get(section_name, "state_file_name_prefix") + \
section_name + \
@@ -472,7 +486,7 @@ def state_file_none_or_valid_json(
if os.path.exists(state_file_abs_path):
if os.path.getsize(state_file_abs_path) > 0:
with open(state_file_abs_path, "r") as state_file:
with open(state_file_abs_path, "r", encoding="utf-8") as state_file:
try:
json.loads(state_file.read())
return True
@@ -532,7 +546,8 @@ def log_successful_download(
with lock:
state_file_none_or_valid_json(state_file_abs_path)
with open(state_file_abs_path, "r+") as state_file:
state_file_open_mode = "r+" if os.path.exists(state_file_abs_path) else "w+"
with open(state_file_abs_path, state_file_open_mode, encoding="utf-8") as state_file:
try:
json_state = json.load(state_file)
except json.JSONDecodeError:
@@ -541,12 +556,12 @@ def log_successful_download(
json_state = []
log.debug(f"{shorthand_uuid} Writing log entry to '{state_file_abs_path}' ...")
with open(state_file_abs_path, "w") as state_file:
with open(state_file_abs_path, "w", encoding="utf-8") as state_file:
json_state.append(state_entry)
max_log_entries = config_obj.getint(section_name, "state_file_retention")
if len(json_state) > max_log_entries:
json_state = truncate_log(json_state, max_log_entries)
json.dump(json_state, state_file, indent=4, sort_keys=True)
json.dump(json_state, state_file, indent=4, sort_keys=True, ensure_ascii=False)
def copy_url(
@@ -554,7 +569,6 @@ def copy_url(
config_obj: configparser.ConfigParser(),
show: type_def.mvw_json_response.Show,
video_metadata: dict,
total_content_length: int,
state_file_abs_path: str,
show_name: str,
job_uuid: str,
@@ -565,12 +579,14 @@ def copy_url(
global download_start_time
global download_last_update_time
global size_downloaded
global size_downloaded_for_progress_tracking
global size_downloaded_for_speed_tracking
update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
max_quality_url = video_metadata["url"]
filename = get_filename(section_name, config_obj, show, max_quality_url, shorthand_uuid)
resume_header = {}
tmp_file_open_mode = "wb"
tmp_file_size = 0
tmp_path = os.path.join(tmp_dir, filename)
dest_path = os.path.join(dest_dir, filename)
@@ -581,29 +597,34 @@ def copy_url(
tmp_file_size = os.path.getsize(tmp_path)
log.debug(f"{shorthand_uuid} Temporary file '{tmp_path}' exists likely from a previous incomplete "
f"download attempt, size is {humanize.naturalsize(tmp_file_size, binary=True)}. Resuming ...")
tmp_file_open_mode = "ab"
try:
with open(tmp_path, "wb") as tmp_file:
with open(tmp_path, tmp_file_open_mode) as tmp_file:
log.info(f"""{shorthand_uuid} Downloading "{show_name}" ...""")
if tmp_file_size > 0:
resume_header = {"range": f"bytes={tmp_file_size}-"}
log.info(f"resume_header: {resume_header}")
log.debug(f"resume_header: {resume_header}")
size_downloaded_for_progress_tracking += tmp_file_size
r = requests.get(max_quality_url, headers=resume_header, stream=True)
for chunk in r.iter_content(32768):
size_downloaded += len(chunk)
size_downloaded_for_progress_tracking += len(chunk)
size_downloaded_for_speed_tracking += len(chunk)
tmp_file.write(chunk)
if time.time() - download_last_update_time >= update_interval:
download_last_update_time = time.time()
dl_speed_so_far = size_downloaded / (download_last_update_time - download_start_time)
time_in_progress = download_last_update_time - download_start_time
dl_speed_so_far = size_downloaded_for_speed_tracking / time_in_progress
human_dl_speed_so_far = f"{humanize.naturalsize(dl_speed_so_far, binary=True)}/s"
percentage_done = size_downloaded / total_content_length * 100
data_missing = total_content_length - size_downloaded_for_progress_tracking
time_til_completion = 1 / dl_speed_so_far * data_missing
human_time_til_completion = humanize.naturaldelta(d.timedelta(seconds=time_til_completion))
percentage_done = size_downloaded_for_progress_tracking / total_content_length * 100
human_pct = "{:.1f}".format(percentage_done)
human_size_dl = humanize.naturalsize(size_downloaded, binary=True)
human_size_dl = humanize.naturalsize(size_downloaded_for_progress_tracking, binary=True)
human_total_dl = humanize.naturalsize(total_content_length, binary=True)
log.debug(f"[thread] Downloaded {human_pct}% ({human_size_dl}/{human_total_dl} "
f"at an average {human_dl_speed_so_far})")
if done_event.is_set():
log.info(f"""{shorthand_uuid} Download of "{show_name}" interrupted""")
return
f"at an average {human_dl_speed_so_far}, approximately {human_time_til_completion} "
f"left til completion.)")
log.info(f"""{shorthand_uuid} Download of "{show_name}" done""")
except IOError:
log.error(f"{shorthand_uuid} IOError during download. Aborting this download thread ...")
@@ -611,12 +632,14 @@ def copy_url(
log.info(f"{shorthand_uuid} Moving file to final location '{dest_path}' ...")
try:
os.rename(tmp_path, dest_path)
shutil.move(tmp_path, dest_path)
except OSError as ose:
log.error(f"{shorthand_uuid} Failed moving file with an OSError\n"
f"{ose}\n"
f"Other threads continue unhindered.")
else:
log_successful_download(section_name, config_obj, show, state_file_abs_path, job_uuid, shorthand_uuid)
log.info(f"{shorthand_uuid} Done moving")
except Exception:
console.print_exception(show_locals=True)
log.error(f"{shorthand_uuid} Failed moving file")
def get_max_quality_url(
@@ -639,22 +662,31 @@ def get_content_length(
return 0
def is_already_downloaded(
show: type_def.mvw_json_response.Show,
state_file_abs_path: str,
show_name: str) -> bool:
with open(state_file_abs_path, "r") as state_file:
def get_json_state(
state_file_abs_path: str) -> json.loads:
try:
with open(state_file_abs_path, "r", encoding="utf-8") as state_file:
try:
json_state = json.load(state_file)
except json.JSONDecodeError:
return []
else:
return json_state
except FileNotFoundError:
log.debug(f"State file does not exist (yet), assuming no previous downloads have ever happened ...")
return []
def is_already_downloaded(
show: type_def.mvw_json_response.Show,
json_state: json.loads,
show_name: str) -> bool:
for log_entry in json_state:
for log_data in [key for key in log_entry]:
if show.topic == log_entry[log_data]["topic"] and show.title == log_entry[log_data]["title"]:
log.debug(f"""Show "{show_name}" already downloaded, won't queue""")
return True
except json.JSONDecodeError:
return False
return False
def download_media(
@@ -664,20 +696,13 @@ def download_media(
global download_start_time
global download_last_update_time
global total_content_length
dl_threads = config_obj.getint(section_name, "dl_threads")
state_file_abs_path = get_state_file_abs_path(section_name, config_obj)
state_lock_file = state_file_abs_path + state_lock_file_ext
video_metadata = {}
for result in json_obj.result.results.copy():
max_quality_url = get_max_quality_url(result)
content_length = get_content_length(max_quality_url)
video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
total_content_length = 0
for video in video_metadata:
total_content_length += video_metadata[video]["content_length"]
video_metadata["total_content_length"] = total_content_length
tmp_dir = expanded_dest_dir(config_obj.get(section_name, "tmp_base_dir"))
dest_dir = expanded_dest_dir(config_obj.get(section_name, "dl_dir"))
log.info(f"""Download location is {tmp_dir}""")
@@ -686,32 +711,44 @@ def download_media(
lock = get_state_file_lock(state_lock_file)
with lock:
state_file_none_or_valid_json(state_file_abs_path)
json_state = get_json_state(state_file_abs_path)
with ThreadPoolExecutor(max_workers=dl_threads) as pool:
download_last_update_time = time.time()
download_start_time = download_last_update_time
update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
log.debug(f"""Will provide updates every {update_interval} {p.plural("second", update_interval)}""")
with lock:
state_file_none_or_valid_json(state_file_abs_path)
for result in json_obj.result.results.copy():
show_name = f"{result.topic} - {result.title}"
if not is_already_downloaded(result, state_file_abs_path, show_name):
future = None
if not is_already_downloaded(result, json_state, show_name):
max_quality_url = get_max_quality_url(result)
content_length = get_content_length(max_quality_url)
video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
total_content_length += video_metadata[result.id]["content_length"]
log.debug(f"Total download size upped to "
f"{humanize.naturalsize(total_content_length, binary=True)}")
job_uuid = str(uuid.uuid4())
shorthand_uuid = f"[{job_uuid[:2]}..{job_uuid[-2:]}]"
log.debug(f"""Queuing "{show_name}" for download ...""")
pool.submit(
log.debug(f"{shorthand_uuid} Job UUID {job_uuid} generated, shorthand is {shorthand_uuid}")
log.debug(f"""{shorthand_uuid} Queuing "{show_name}" for download ...""")
future = pool.submit(
copy_url,
section_name,
config_obj,
result,
video_metadata[result.id],
video_metadata["total_content_length"],
state_file_abs_path,
show_name,
job_uuid,
shorthand_uuid,
tmp_dir,
dest_dir)
if future is not None:
future.result()
if __name__ == '__main__':
@@ -741,8 +778,6 @@ if __name__ == '__main__':
if config.has_option(section, "title_not_regex"):
json_response = dedup_json_titles(section, config, json_response)
log.debug(f"Downloading {json_response.result.queryInfo.resultCount} "
log.debug(f"Interested in {json_response.result.queryInfo.resultCount} "
f"""{p.plural("show", json_response.result.queryInfo.resultCount)} ...""")
download_media(section, config, json_response)
# console.print_json(json_response.json())