Compare commits


13 Commits

3 changed files with 100 additions and 86 deletions


@@ -19,7 +19,7 @@ dl_filename_all_lowercase = no
 min_duration = 1200
 max_duration = 2700
 query = @maus-query.json
-title_not_regex = audiodeskription|gebärdensprache
+title_not_regex = audiodeskription|gebärdensprache|hörfassung
 # dl_filename_pattern = &(publish_date)s.&(ext)s
 # publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
 # query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
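This is the per-section INI config; the added alternative also filters out audio-described "Hörfassung" variants of a show. Applied to result titles, a title_not_regex of this shape works roughly like the sketch below (the helper name and case-insensitive matching are assumptions, not necessarily mvw-dl's actual behavior):

import re

# Sketch of a title_not_regex filter like the one above; the helper name and
# case-insensitive matching are assumptions.
title_not_regex = re.compile(r"audiodeskription|gebärdensprache|hörfassung", re.IGNORECASE)

def keep_title(title: str) -> bool:
    # Keep a show only if no excluded variant marker appears in its title.
    return title_not_regex.search(title) is None

print(keep_title("MausSpezial: Der kleine Maulwurf"))         # True
print(keep_title("Die Sendung vom 13.03.2022 (Hörfassung)"))  # False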


@@ -17,5 +17,5 @@
     "sortOrder": "desc",
     "future": false,
     "offset": 0,
-    "size": 15
+    "size": 100
 }
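This appears to be the query payload referenced as @maus-query.json above; size caps how many results MediathekViewWeb returns per query, so raising it from 15 to 100 lets one run consider up to 100 shows. To test a query file by hand against the public instance, something like the following sketch should work (the endpoint and the text/plain content type are assumptions to verify against the MediathekViewWeb docs):

import requests

# Hedged sketch for testing a query payload by hand; the endpoint and
# content type are assumptions, not taken from mvw-dl itself.
with open("maus-query.json", "r", encoding="utf-8") as f:
    query = f.read()

r = requests.post(
    "https://mediathekviewweb.de/api/query",
    data=query.encode("utf-8"),
    headers={"Content-Type": "text/plain"},
    timeout=30,
)
print(r.json()["result"]["queryInfo"]["resultCount"])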

mvw-dl.py

@@ -5,6 +5,7 @@ import logging
 import os
 import pathlib
 import re
+import shutil
 import sys
 import time
@@ -47,7 +48,9 @@ from rich.progress import (
 # TODO set locale for datetime and others to globally stick to en_US
 download_start_time = 0
 download_last_update_time = 0
-size_downloaded = 0
+total_content_length = 0
+size_downloaded_for_progress_tracking = 0
+size_downloaded_for_speed_tracking = 0
 file_lock_timeout = 1
 state_lock_file_ext = ".lock"
@@ -131,7 +134,7 @@ log = logging.getLogger("rich")
 # Our own code logs with this level
 log.setLevel(logging.DEBUG)
 # connectionpool and filelock log with WARNING, we don't need its verbosity
-# logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
+logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
 logging.getLogger("filelock").setLevel(logging.WARNING)
 install(show_locals=True)
@@ -224,7 +227,7 @@ def validate_config_sections(
 def query_string_from_file(
         filename: str) -> str:
-    with open(filename, "r") as jsonfile:
+    with open(filename, "r", encoding="utf-8") as jsonfile:
         query_string = jsonfile.read()
     return query_string
@@ -349,16 +352,6 @@ def dedup_json_titles(
     return json_obj
 
 
-done_event = Event()
-
-
-def handle_sigint(signum, frame):
-    done_event.set()
-
-
-signal.signal(signal.SIGINT, handle_sigint)
-
-
 def expanded_dest_dir(
         raw_dest_dir: str) -> str:
     user_expanded_dest_dir = os.path.expanduser(raw_dest_dir)
@@ -381,13 +374,17 @@ def filename_replace_pattern(
     show_attrs = [attr for attr in dir(show) if not attr.startswith('_') and not callable(getattr(show, attr))]
 
     for attr in show_attrs:
-        # log.debug(f"{shorthand_uuid} Replacing filename pattern '&({attr})s' ...")
-        filename = re.sub(r"&\(" + re.escape(attr) + r"\)s", str(getattr(show, attr)), filename)
-        # log.debug(f"{shorthand_uuid} New filename: '{filename}'")
+        attr_re = re.compile(r"&\(" + re.escape(attr) + r"\)s")
+        if re.search(attr_re, filename):
+            log.debug(f"{shorthand_uuid} Replacing filename pattern '&({attr})s' ...")
+            filename = re.sub(attr_re, str(getattr(show, attr)), filename)
+            log.debug(f"{shorthand_uuid} New filename: '{filename}'")
     for extended_attr in show_extended:
-        # log.debug(f"{shorthand_uuid} Replacing filename pattern '&({extended_attr})s' ...")
-        filename = re.sub(r"&\(" + re.escape(extended_attr) + r"\)s", show_extended[extended_attr], filename)
-        # log.debug(f"{shorthand_uuid} New filename: '{filename}'")
+        extended_attr_re = re.compile(r"&\(" + re.escape(extended_attr) + r"\)s")
+        if re.search(extended_attr_re, filename):
+            log.debug(f"{shorthand_uuid} Replacing filename pattern '&({extended_attr})s' ...")
+            filename = re.sub(extended_attr_re, show_extended[extended_attr], filename)
+            log.debug(f"{shorthand_uuid} New filename: '{filename}'")
     return filename
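The &(attr)s placeholders mirror Python's printf-style %(attr)s tokens; the rewritten loop only logs when a placeholder is actually present in the pattern. The substitution itself, reduced to a standalone sketch (a plain dict stands in for the Show model whose attributes the real code reads via getattr()):

import re

# Standalone sketch of the &(attr)s substitution above; a dict stands in
# for the real Show object.
show = {"publish_date": "20220312", "ext": "mp4"}
filename = "&(publish_date)s.&(ext)s"

for attr, value in show.items():
    attr_re = re.compile(r"&\(" + re.escape(attr) + r"\)s")
    if attr_re.search(filename):
        filename = attr_re.sub(str(value), filename)

print(filename)  # 20220312.mp4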
@@ -398,6 +395,7 @@ def get_safe_filename(
     log.debug(f"{shorthand_uuid} Replacing unsafe characters in filename with dashes ...")
     clean_filename = re.sub(r"""[/\\?%*:|"<>\x7F\x00-\x1F]""", "-", dirty_filename)
+    log.debug(f"{shorthand_uuid} New filename: '{clean_filename}'")
 
     return clean_filename
@@ -450,7 +448,6 @@ def get_filename(
         filename_safe = filename_safe.lower()
         log.debug(f"{shorthand_uuid} New filename: '{filename_safe}'")
-    log.debug(f"{shorthand_uuid} {filename_safe}")
 
     return filename_safe
@@ -472,7 +469,7 @@ def state_file_none_or_valid_json(
     if os.path.exists(state_file_abs_path):
         if os.path.getsize(state_file_abs_path) > 0:
-            with open(state_file_abs_path, "r") as state_file:
+            with open(state_file_abs_path, "r", encoding="utf-8") as state_file:
                 try:
                     json.loads(state_file.read())
                     return True
@@ -532,7 +529,7 @@ def log_successful_download(
     with lock:
         state_file_none_or_valid_json(state_file_abs_path)
-        with open(state_file_abs_path, "r+") as state_file:
+        with open(state_file_abs_path, "r+", encoding="utf-8") as state_file:
             try:
                 json_state = json.load(state_file)
             except json.JSONDecodeError:
@@ -541,12 +538,12 @@ def log_successful_download(
                 json_state = []
 
         log.debug(f"{shorthand_uuid} Writing log entry to '{state_file_abs_path}' ...")
-        with open(state_file_abs_path, "w") as state_file:
+        with open(state_file_abs_path, "w", encoding="utf-8") as state_file:
             json_state.append(state_entry)
             max_log_entries = config_obj.getint(section_name, "state_file_retention")
             if len(json_state) > max_log_entries:
                 json_state = truncate_log(json_state, max_log_entries)
-            json.dump(json_state, state_file, indent=4, sort_keys=True)
+            json.dump(json_state, state_file, indent=4, sort_keys=True, ensure_ascii=False)
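ensure_ascii=False matters here because show titles contain umlauts: together with the explicit encoding="utf-8" they are now written to the state file verbatim instead of as \uXXXX escapes. A quick illustration:

import json

entry = {"title": "Sendung mit Gebärdensprache"}
print(json.dumps(entry))                      # {"title": "Sendung mit Geb\u00e4rdensprache"}
print(json.dumps(entry, ensure_ascii=False))  # {"title": "Sendung mit Gebärdensprache"}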
@@ -554,7 +551,6 @@ def copy_url(
         config_obj: configparser.ConfigParser(),
         show: type_def.mvw_json_response.Show,
         video_metadata: dict,
-        total_content_length: int,
         state_file_abs_path: str,
         show_name: str,
         job_uuid: str,
@@ -565,12 +561,14 @@ def copy_url(
     global download_start_time
     global download_last_update_time
-    global size_downloaded
+    global size_downloaded_for_progress_tracking
+    global size_downloaded_for_speed_tracking
 
     update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
     max_quality_url = video_metadata["url"]
     filename = get_filename(section_name, config_obj, show, max_quality_url, shorthand_uuid)
     resume_header = {}
+    tmp_file_open_mode = "wb"
     tmp_file_size = 0
     tmp_path = os.path.join(tmp_dir, filename)
     dest_path = os.path.join(dest_dir, filename)
@@ -581,29 +579,34 @@ def copy_url(
         tmp_file_size = os.path.getsize(tmp_path)
         log.debug(f"{shorthand_uuid} Temporary file '{tmp_path}' exists likely from a previous incomplete "
                   f"download attempt, size is {humanize.naturalsize(tmp_file_size, binary=True)}. Resuming ...")
+        tmp_file_open_mode = "ab"
 
     try:
-        with open(tmp_path, "wb") as tmp_file:
+        with open(tmp_path, tmp_file_open_mode) as tmp_file:
             log.info(f"""{shorthand_uuid} Downloading "{show_name}" ...""")
             if tmp_file_size > 0:
                 resume_header = {"range": f"bytes={tmp_file_size}-"}
-                log.info(f"resume_header: {resume_header}")
+                log.debug(f"resume_header: {resume_header}")
+                size_downloaded_for_progress_tracking += tmp_file_size
             r = requests.get(max_quality_url, headers=resume_header, stream=True)
             for chunk in r.iter_content(32768):
-                size_downloaded += len(chunk)
+                size_downloaded_for_progress_tracking += len(chunk)
+                size_downloaded_for_speed_tracking += len(chunk)
                 tmp_file.write(chunk)
                 if time.time() - download_last_update_time >= update_interval:
                     download_last_update_time = time.time()
-                    dl_speed_so_far = size_downloaded / (download_last_update_time - download_start_time)
+                    time_in_progress = download_last_update_time - download_start_time
+                    dl_speed_so_far = size_downloaded_for_speed_tracking / time_in_progress
                     human_dl_speed_so_far = f"{humanize.naturalsize(dl_speed_so_far, binary=True)}/s"
-                    percentage_done = size_downloaded / total_content_length * 100
+                    data_missing = total_content_length - size_downloaded_for_progress_tracking
+                    time_til_completion = 1 / dl_speed_so_far * data_missing
+                    human_time_til_completion = humanize.naturaldelta(d.timedelta(seconds=time_til_completion))
+                    percentage_done = size_downloaded_for_progress_tracking / total_content_length * 100
                     human_pct = "{:.1f}".format(percentage_done)
-                    human_size_dl = humanize.naturalsize(size_downloaded, binary=True)
+                    human_size_dl = humanize.naturalsize(size_downloaded_for_progress_tracking, binary=True)
                     human_total_dl = humanize.naturalsize(total_content_length, binary=True)
                     log.debug(f"[thread] Downloaded {human_pct}% ({human_size_dl}/{human_total_dl} "
-                              f"at an average {human_dl_speed_so_far})")
-            if done_event.is_set():
-                log.info(f"""{shorthand_uuid} Download of "{show_name}" interrupted""")
-                return
+                              f"at an average {human_dl_speed_so_far}, approximately {human_time_til_completion} "
+                              f"left til completion.)")
         log.info(f"""{shorthand_uuid} Download of "{show_name}" done""")
     except IOError:
         log.error(f"{shorthand_uuid} IOError during download. Aborting this download thread ...")
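The resume pattern introduced here is append mode plus an HTTP Range request; reduced to a standalone sketch (url and tmp_path are placeholders):

import os
import requests

# Standalone sketch of the resume logic above; url and tmp_path are placeholders.
url = "https://example.org/video.mp4"
tmp_path = "video.mp4.part"

resume_header = {}
open_mode = "wb"
if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
    # Ask the server for the remainder only and append to what we already have.
    resume_header = {"Range": f"bytes={os.path.getsize(tmp_path)}-"}
    open_mode = "ab"

with open(tmp_path, open_mode) as tmp_file:
    r = requests.get(url, headers=resume_header, stream=True, timeout=30)
    for chunk in r.iter_content(32768):
        tmp_file.write(chunk)

A hardened version would also check for a 206 Partial Content status before appending: a server that ignores the Range header replies 200 with the full body, and appending that duplicates data.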
@@ -611,12 +614,14 @@ def copy_url(
     log.info(f"{shorthand_uuid} Moving file to final location '{dest_path}' ...")
     try:
-        os.rename(tmp_path, dest_path)
+        shutil.move(tmp_path, dest_path)
+    except OSError as ose:
+        log.error(f"{shorthand_uuid} Failed moving file with an OSError\n"
+                  f"{ose}\n"
+                  f"Other threads continue unhindered.")
+    else:
         log_successful_download(section_name, config_obj, show, state_file_abs_path, job_uuid, shorthand_uuid)
         log.info(f"{shorthand_uuid} Done moving")
-    except Exception:
-        console.print_exception(show_locals=True)
-        log.error(f"{shorthand_uuid} Failed moving file")
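Switching from os.rename to shutil.move is the portable choice here: os.rename raises OSError (EXDEV) when tmp_base_dir and dl_dir live on different filesystems, while shutil.move falls back to copy-and-delete in that case. In isolation (both paths are placeholders):

import shutil

# shutil.move survives cross-device moves where os.rename raises EXDEV.
shutil.move("/tmp/mvw-dl/show.mp4", "/mnt/media/show.mp4")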
@@ -639,22 +644,28 @@ def get_content_length(
         return 0
 
 
-def is_already_downloaded(
-        show: type_def.mvw_json_response.Show,
-        state_file_abs_path: str,
-        show_name: str) -> bool:
+def get_json_state(
+        state_file_abs_path: str) -> json.loads:
 
-    with open(state_file_abs_path, "r") as state_file:
+    with open(state_file_abs_path, "r", encoding="utf-8") as state_file:
         try:
             json_state = json.load(state_file)
-            for log_entry in json_state:
-                for log_data in [key for key in log_entry]:
-                    if show.topic == log_entry[log_data]["topic"] and show.title == log_entry[log_data]["title"]:
-                        log.debug(f"""Show "{show_name}" already downloaded, won't queue""")
-                        return True
         except json.JSONDecodeError:
-            return False
-    return False
+            return []
+        else:
+            return json_state
+
+
+def is_already_downloaded(
+        show: type_def.mvw_json_response.Show,
+        json_state: json.loads,
+        show_name: str) -> bool:
+
+    for log_entry in json_state:
+        for log_data in [key for key in log_entry]:
+            if show.topic == log_entry[log_data]["topic"] and show.title == log_entry[log_data]["title"]:
+                log.debug(f"""Show "{show_name}" already downloaded, won't queue""")
+                return True
     return False
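The state file is a JSON list with one {uuid: data} mapping per downloaded show, and the dedup check scans it for a matching topic/title pair. A toy illustration (the entry layout is inferred from the loop above, with hypothetical sample data):

# Toy illustration of the dedup check; entry layout inferred from the code above.
json_state = [
    {"2f..9c": {"topic": "Die Sendung mit der Maus", "title": "MausSpezial"}},
]

def is_already_downloaded(topic: str, title: str) -> bool:
    for log_entry in json_state:
        for log_data in log_entry:  # keys of the single-entry mapping
            if topic == log_entry[log_data]["topic"] and title == log_entry[log_data]["title"]:
                return True
    return False

print(is_already_downloaded("Die Sendung mit der Maus", "MausSpezial"))  # True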
@@ -664,20 +675,13 @@ def download_media(
     global download_start_time
     global download_last_update_time
+    global total_content_length
 
     dl_threads = config_obj.getint(section_name, "dl_threads")
     state_file_abs_path = get_state_file_abs_path(section_name, config_obj)
     state_lock_file = state_file_abs_path + state_lock_file_ext
     video_metadata = {}
 
-    for result in json_obj.result.results.copy():
-        max_quality_url = get_max_quality_url(result)
-        content_length = get_content_length(max_quality_url)
-        video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
-
-    total_content_length = 0
-    for video in video_metadata:
-        total_content_length += video_metadata[video]["content_length"]
-    video_metadata["total_content_length"] = total_content_length
-
     tmp_dir = expanded_dest_dir(config_obj.get(section_name, "tmp_base_dir"))
     dest_dir = expanded_dest_dir(config_obj.get(section_name, "dl_dir"))
     log.info(f"""Download location is {tmp_dir}""")
@@ -686,32 +690,44 @@ def download_media(
     lock = get_state_file_lock(state_lock_file)
 
     with lock:
         state_file_none_or_valid_json(state_file_abs_path)
+        json_state = get_json_state(state_file_abs_path)
 
     with ThreadPoolExecutor(max_workers=dl_threads) as pool:
         download_last_update_time = time.time()
         download_start_time = download_last_update_time
         update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
         log.debug(f"""Will provide updates every {update_interval} {p.plural("second", update_interval)}""")
-        with lock:
-            state_file_none_or_valid_json(state_file_abs_path)
         for result in json_obj.result.results.copy():
             show_name = f"{result.topic} - {result.title}"
-            if not is_already_downloaded(result, state_file_abs_path, show_name):
+            future = None
+            if not is_already_downloaded(result, json_state, show_name):
+                max_quality_url = get_max_quality_url(result)
+                content_length = get_content_length(max_quality_url)
+                video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
+                total_content_length += video_metadata[result.id]["content_length"]
+                log.debug(f"Total download size upped to "
+                          f"{humanize.naturalsize(total_content_length, binary=True)}")
                 job_uuid = str(uuid.uuid4())
                 shorthand_uuid = f"[{job_uuid[:2]}..{job_uuid[-2:]}]"
-                log.debug(f"""Queuing "{show_name}" for download ...""")
-                pool.submit(
+                log.debug(f"{shorthand_uuid} Job UUID {job_uuid} generated, shorthand is {shorthand_uuid}")
+                log.debug(f"""{shorthand_uuid} Queuing "{show_name}" for download ...""")
+                future = pool.submit(
                     copy_url,
                     section_name,
                     config_obj,
                     result,
                     video_metadata[result.id],
-                    video_metadata["total_content_length"],
                     state_file_abs_path,
                     show_name,
                     job_uuid,
                     shorthand_uuid,
                     tmp_dir,
                     dest_dir)
+            if future is not None:
+                future.result()
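Worth noting: pool.submit returns a concurrent.futures.Future, and calling future.result() right inside the loop blocks until that job finishes (re-raising any exception from the worker thread), so this change effectively serializes downloads regardless of dl_threads. The pattern in isolation:

from concurrent.futures import ThreadPoolExecutor

def work(n: int) -> int:
    return n * n

with ThreadPoolExecutor(max_workers=4) as pool:
    for n in range(3):
        future = pool.submit(work, n)
        # result() blocks until this job completes and re-raises worker
        # exceptions, so submit-then-wait runs the jobs one at a time.
        print(future.result())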
@@ -741,8 +757,6 @@ if __name__ == '__main__':
         if config.has_option(section, "title_not_regex"):
             json_response = dedup_json_titles(section, config, json_response)
 
-        log.debug(f"Downloading {json_response.result.queryInfo.resultCount} "
+        log.debug(f"Interested in {json_response.result.queryInfo.resultCount} "
                   f"""{p.plural("show", json_response.result.queryInfo.resultCount)} ...""")
 
         download_media(section, config, json_response)
         # console.print_json(json_response.json())