Prep for logging and safe filenames

Use user's download location, resolve mentions of ~ (tilde) and environment vars
Download episodes and provide regular updates
2022-03-17 18:41:17 +01:00 · 2022-03-17 17:58:44 +01:00 · 2022-03-17 17:37:57 +01:00 · 2022-03-17 17:23:41 +01:00 · 2022-03-17 16:13:13 +01:00
3 changed files with 111 additions and 63 deletions
--- a/config.ini
+++ b/config.ini
@@ -8,6 +8,8 @@ state_file_name_prefix = state-
 state_file_name_suffix = .log
 mvw_endpoint = http://localhost:8000/api/query
 title_dedup_winner = first
+dl_progress_update_interval = 10
+dl_threads = 2

 [maus]
 min_duration = 1200
--- a/maus-query.json
+++ b/maus-query.json
@@ -17,5 +17,5 @@
    "sortOrder": "desc",
    "future": false,
    "offset": 0,
-    "size": 50
+    "size": 20
 }
--- a/mvw-dl.py
+++ b/mvw-dl.py
@@ -28,6 +28,11 @@ from threading import Event
 from typing import Iterable
 from urllib.request import urlopen

+# TODO set locale for datetime and others to globally stick to en_US
+download_start_time = 0
+download_last_update_time = 0
+size_downloaded = 0
+
 from rich.progress import (
    BarColumn,
    DownloadColumn,
@@ -83,7 +88,9 @@ class CONST(object):
        {"key": "state_file_name_prefix", "value": "state-"},
        {"key": "state_file_name_suffix", "value": ".log"},
        {"key": "mvw_endpoint", "value": "http://localhost:8000/api/query"},
-        {"key": "title_dedup_winner", "value": "first"}
+        {"key": "title_dedup_winner", "value": "first"},
+        {"key": "dl_progress_update_interval", "value": "10"},
+        {"key": "dl_threads", "value": "2"}
    ]
    CFG_KNOWN_SECTION = [
        {"key": "min_duration", "is_mandatory": False},
@@ -338,82 +345,120 @@ def handle_sigint(signum, frame):
 signal.signal(signal.SIGINT, handle_sigint)


-def copy_url(
+def get_safe_filename(
+        dirty_filename: str) -> str:
+    """Return dirty_filename with characters unsafe in file names replaced by "-".
+
+    Replaced are slash, backslash, ? % * : | " < >, DEL (0x7F) and the
+    control characters 0x00-0x1F; everything else is kept unchanged.
+    Approach taken from https://stackoverflow.com/a/71199182
+    """
+
+    clean_filename = re.sub(r"[/\\?%*:|\"<>\x7F\x00-\x1F]", "-", dirty_filename)
+    return clean_filename
+
+
+def log_successful_download(
        show: type_def.mvw_json_response.Show) -> None:
+    # Stub: called after a download completes, but does nothing yet.
+    pass
+
+
+def copy_url(
+        section_name: str,
+        config_obj: configparser.ConfigParser,
+        show: type_def.mvw_json_response.Show,
+        video_metadata: dict,
+        total_content_length: int) -> None:
    """Copy data from a url to a local file."""
+    # Streams video_metadata["url"] into <dl_dir>/<last path segment of the URL>,
+    # with "~" and environment variables in the path expanded.
+    # Progress lives in module-level globals shared by every worker thread, so
+    # the logged percentage and speed describe the whole batch, not this file.
+    # NOTE(review): "size_downloaded +=" is not synchronized between threads --
+    # confirm whether a slightly lossy running total is acceptable.

-    url = show.url_video_hd
-    filename = url.split("/")[-1]
-    dest_path = os.path.join("./", filename)
-    release_timestamp = d.datetime.utcfromtimestamp(show.timestamp).strftime('%A %x %X')
-    #s = requests.Session()
-    #newline = "\n"
-    #log.debug(f"Request method: {req.method}\n"
-    #          f"URL: {req.url}\n"
-    #          f"""{newline.join(f"Header '{header}': '{value}'" for header, value in list(req.headers.items()))}\n""")
+    global download_start_time
+    global download_last_update_time
+    global size_downloaded
+
+    update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
+    max_quality_url = video_metadata["url"]
+    filename = max_quality_url.split("/")[-1]
+    dest_dir = config_obj.get(section_name, "dl_dir")
+    dest_path = os.path.join(dest_dir, filename)
+    dest_path = os.path.expanduser(dest_path)
+    dest_path = os.path.expandvars(dest_path)
    show_name = f"{show.topic} - {show.title}"
-    log.debug(f"""Downloading "{show_name}" posted {release_timestamp} ...""")
+    publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime('%Y%m%d')
+
+    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    with open(dest_path, "wb") as dest_file:
-        last_update_time = time.time()
-        r = requests.get(url, stream=True)
-        total_length = int(r.headers.get('content-length'))
-        size_downloaded = 0
+        log.info(f"""Downloading "{show_name}" ...""")
+        log.info(f"Download location resolved to {dest_path}")
+        r = requests.get(max_quality_url, stream=True)
        for chunk in r.iter_content(32768):
            size_downloaded += len(chunk)
            dest_file.write(chunk)
-            if time.time() - last_update_time >= 10:
-                percentage_done = size_downloaded / total_length * 100
+            if time.time() - download_last_update_time >= update_interval:
+                download_last_update_time = time.time()
+                # Elapsed time can be 0 when the configured interval is 0
+                elapsed = download_last_update_time - download_start_time
+                dl_speed_so_far = size_downloaded / elapsed if elapsed > 0 else 0
+                human_dl_speed_so_far = f"{humanize.naturalsize(dl_speed_so_far, binary=True)}/s"
+                # The total is 0 when no content length was known up front;
+                # do not let a progress message crash the download
+                if total_content_length > 0:
+                    percentage_done = size_downloaded / total_content_length * 100
+                else:
+                    percentage_done = 0.0
                human_pct = "{:.1f}".format(percentage_done)
-                human_size_dl = humanize.naturalsize(size_downloaded)
-                human_total_dl = humanize.naturalsize(total_length)
-                last_update_time = time.time()
-                log.debug(f"""Download of "{show_name}" at {human_pct}% ({human_size_dl}/{human_total_dl})""")
+                human_size_dl = humanize.naturalsize(size_downloaded, binary=True)
+                human_total_dl = humanize.naturalsize(total_content_length, binary=True)
+                log.debug(f"Downloaded {human_pct}% ({human_size_dl}/{human_total_dl} at an average "
+                          f"{human_dl_speed_so_far})")
            if done_event.is_set():
-                log.debug(f"done_event")
+                log.info(f"""Download of "{show_name}" interrupted""")
                return
-
-        #got_json_response = MVWJSONResponse(**json.loads(s.content))
-        #return got_json_response
-
-    # progress.console.log(f"Requesting {url}")
-    # response = urlopen(url)
-    # # This will break if the response doesn't contain content length
-    # progress.update(task_id, total=int(response.info()["Content-length"]))
-    # with open(path, "wb") as dest_file:
-    #     progress.start_task(task_id)
-    #     for data in iter(partial(response.read, 32768), b""):
-    #         dest_file.write(data)
-    #         progress.update(task_id, advance=len(data))
-    #         if done_event.is_set():
-    #             return
-    # progress.console.log(f"Downloaded {path}")
+    log.info(f"""Download of "{show_name}" done""")
+    log_successful_download(show)


-#def download(urls: Iterable[str], dest_dir: str):
-#    """Download multuple files to the given directory."""
-#
-#    with progress:
-#        with ThreadPoolExecutor(max_workers=1) as pool:
-#            for url in urls:
-#                filename = url.split("/")[-1]
-#                dest_path = os.path.join(dest_dir, filename)
-#                task_id = progress.add_task("download", filename=filename, start=False)
-#                pool.submit(copy_url, task_id, url, dest_path)
+def get_max_quality_url(
+        show: type_def.mvw_json_response.Show) -> str:
+    """Return the best available video URL of show.
+
+    Preference order is url_video_hd, then url_video, then url_video_low.
+    url_video_low is returned without a check, so the result may be empty
+    if none of the three is set.
+    """
+    if show.url_video_hd:
+        max_quality_url = show.url_video_hd
+    elif show.url_video:
+        max_quality_url = show.url_video
+    else:
+        max_quality_url = show.url_video_low
+    return max_quality_url
+
+
+def get_content_length(
+        video_url: str) -> int:
+    """Return the size in bytes the server reports for video_url.
+
+    Issues a HEAD request and reads the Content-Length header. Returns 0
+    when the final response is not 200 OK or carries no Content-Length.
+    """
+    # requests does not follow redirects for HEAD unless asked to, so a
+    # redirecting media URL would otherwise always be reported as size 0
+    r = requests.head(video_url, allow_redirects=True)
+    if r.status_code == requests.codes.ok:
+        # The header is optional (e.g. chunked responses); treat it as unknown
+        return int(r.headers.get("content-length", 0))
+    else:
+        return 0


 def download_media(
        section_name: str,
-        config_obj: configparser.ConfigParser()) -> None:
-    with ThreadPoolExecutor(max_workers=2) as pool:
-        for result in json_response.result.results.copy():
-            # filename = url.split("/")[-1]
-            # dest_path = os.path.join(dest_dir, filename)
-            # task_id = progress.add_task("download", filename=filename, start=False)
-            pool.submit(copy_url, result)
-            # TODO before sending into pool validate which url we're going to use
-            # TODO from each url get total content-length
-            # TODO use total content-length for overall progress of what we want to download
-    pass
+        config_obj: configparser.ConfigParser,
+        json_obj: MVWJSONResponse) -> None:
+    """Download every show listed in json_obj using a pool of worker threads.
+
+    For each show the best-quality URL and its content length are determined
+    first, so that progress can be reported against the sum of all downloads.
+    One copy_url job per show is then submitted to a thread pool limited to
+    the configured "dl_threads". Jobs that raise are reported in the log.
+    """
+
+    global download_start_time
+    global download_last_update_time
+
+    dl_threads = config_obj.getint(section_name, "dl_threads")
+    video_metadata = {}
+
+    for result in json_obj.result.results.copy():
+        max_quality_url = get_max_quality_url(result)
+        content_length = get_content_length(max_quality_url)
+        video_metadata[result.id] = {"url": max_quality_url, "content_length": content_length}
+    total_content_length = 0
+    for video in video_metadata:
+        total_content_length += video_metadata[video]["content_length"]
+    # Stored next to the per-show entries; must be added only after summing
+    video_metadata["total_content_length"] = total_content_length
+    log.info(f"""Download location is {config_obj.get(section_name, "dl_dir")}""")
+    log.info(f"Limiting parallel downloads to {dl_threads} ...")
+    with ThreadPoolExecutor(max_workers=dl_threads) as pool:
+        download_last_update_time = time.time()
+        download_start_time = download_last_update_time
+        update_interval = config_obj.getint(section_name, "dl_progress_update_interval")
+        log.debug(f"""Will provide updates every {update_interval} {p.plural("second", update_interval)}""")
+        futures = []
+        for result in json_obj.result.results.copy():
+            futures.append(pool.submit(
+                copy_url,
+                section_name,
+                config_obj,
+                result,
+                video_metadata[result.id],
+                video_metadata["total_content_length"]))
+        # An exception raised inside a worker is stored on its future and is
+        # lost unless someone asks for it, so surface every failure in the log
+        for future in futures:
+            job_error = future.exception()
+            if job_error is not None:
+                log.error(f"Download job failed: {job_error!r}")


 if __name__ == '__main__':
@@ -443,7 +488,8 @@ if __name__ == '__main__':
        if config.has_option(section, "title_not_regex"):
            json_response = dedup_json_titles(section, config, json_response)

-        log.debug(f"Downloading shows ...")
-        download_media(section, config)
+        log.debug(f"Downloading {json_response.result.queryInfo.resultCount} "
+                  f"""{p.plural("show", json_response.result.queryInfo.resultCount)} ...""")
+        download_media(section, config, json_response)

            # console.print_json(json_response.json())
Author	SHA1	Message	Date
hygienic-books	25d0059f14	Prep for logging and safe filenames	2022-03-17 18:41:17 +01:00
hygienic-books	296e2ca7e5	Use user's download location, resolve mentions of ~ (tilde) and environment vars	2022-03-17 17:58:44 +01:00
hygienic-books	34ef348929	Download episodes and provide regular updates	2022-03-17 17:37:57 +01:00
hygienic-books	47bb1f761a	Get total content_length and provide just one global stream of download progress updates	2022-03-17 17:23:41 +01:00
hygienic-books	d92236a2f2	Prior to submitting download job to thread pool determine URL for highest quality	2022-03-17 16:13:13 +01:00