From 601583afc357ec6027221ea9ac712d097ac753af Mon Sep 17 00:00:00 2001
From: hygienic-books
Date: Sat, 19 Mar 2022 08:50:51 +0100
Subject: [PATCH] Maintain a state file

---
 config.ini |  4 ++-
 mvw-dl.py  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/config.ini b/config.ini
index 2571560..263bb3c 100644
--- a/config.ini
+++ b/config.ini
@@ -12,7 +12,7 @@ dl_progress_update_interval = 10
 dl_threads = 2
 dl_filename_pattern = &(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s
 publish_date_srtftime_pattern = %%Y%%m%%d
-dl_filename_replace_spaces = 
+dl_filename_replace_spaces_with = 
 dl_filename_all_lowercase = no
 
 [maus]
@@ -20,6 +20,8 @@ min_duration = 1200
 max_duration = 2700
 query = @maus-query.json
 title_not_regex = audiodeskription|gebärdensprache
+dl_filename_pattern = &(publish_date)s.&(ext)s
+publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
 # query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
 # state_file_name = maus
 # tmp_base_dir = %(tmp_base_dir)s/maus
diff --git a/mvw-dl.py b/mvw-dl.py
index 1647f21..46825b0 100644
--- a/mvw-dl.py
+++ b/mvw-dl.py
@@ -72,6 +72,7 @@ JSONType = t.Union[str, int, float, bool, None, t.Dict[str, t.Any], t.List[t.Any]]
 # 1: Config file invalid, it has no sections
 # 2: Config file invalid, sections must define at least CONST.CFG_MANDATORY
 # 3: No search results to download
+# 4: State file exists and is non-empty but does not contain usable JSON
 
 
 class CONST(object):
@@ -94,7 +95,7 @@ class CONST(object):
         {"key": "dl_threads", "value": "2"},
         {"key": "dl_filename_pattern", "value": "&(channel)s - &(publish_date)s - &(topic)s - &(title)s"},
         {"key": "publish_date_srtftime_pattern", "value": "%%Y%%m%%d"},
-        {"key": "dl_filename_replace_spaces", "value": "_"},
+        {"key": "dl_filename_replace_spaces_with", "value": "_"},
         {"key": "dl_filename_all_lowercase", "value": "yes"}
     ]
     CFG_KNOWN_SECTION = [
@@ -394,7 +395,7 @@ def filename_replace_spaces_with_underscores(
         section_name: str,
         config_obj: configparser.ConfigParser(),
         filename: str) -> str:
-    space_replace_string = config_obj.get(section_name, "dl_filename_replace_spaces")
+    space_replace_string = config_obj.get(section_name, "dl_filename_replace_spaces_with")
     log.debug(f"Replacing space characters with '{space_replace_string}' ...")
     underscored_filename = re.sub(
         r"\s",
@@ -411,19 +412,91 @@ def get_filename(
         max_quality_url: str) -> str:
     filename_replaced_patterns = filename_replace_pattern(section_name, config_obj, show, max_quality_url)
     filename_safe = get_safe_filename(filename_replaced_patterns)
-    if config.get(section_name, "dl_filename_replace_spaces"):
+    if config.get(section_name, "dl_filename_replace_spaces_with"):
         filename_safe = filename_replace_spaces_with_underscores(section_name, config_obj, filename_safe)
     if config.getboolean(section_name, "dl_filename_all_lowercase"):
         log.debug(f"Lowercasing all filename characters ...")
         filename_safe = filename_safe.lower()
     log.debug(f"New filename: '{filename_safe}'")
     log.debug(filename_safe)
-    quit()
+    return filename_safe
+
+
+def get_state_file_abs_path(
+        section_name: str,
+        config_obj: configparser.ConfigParser()) -> str:
+
+    state_dir = config_obj.get(section_name, "state_files_dir")
+    state_file = \
+        config_obj.get(section_name, "state_file_name_prefix") + \
+        section_name + \
"state_file_name_suffix") + state_file_abs_path = os.path.join(state_dir, state_file) + return state_file_abs_path + + +def state_file_none_or_valid_json( + state_file_abs_path: str) -> bool: + + if os.path.exists(state_file_abs_path): + if os.path.getsize(state_file_abs_path) > 0: + with open(state_file_abs_path, "r") as state_file: + try: + json.loads(state_file.read()) + return True + except json.JSONDecodeError: + log.warning(f"State file '{state_file_abs_path}' does not contain valid JSON. We're not going to " + f"be able to log anything into it. Exiting 4 ...") + sys.exit(4) + else: + return True + else: + return True + + +def truncate_log( + json_data: json.loads, + max_log_entries: int) -> json.loads: + + for i in range(len(json_data)): + del json_data[i] + if len(json_data) <= max_log_entries: + break + return json_data def log_successful_download( - show: type_def.mvw_json_response.Show) -> None: - pass + section_name: str, + config_obj: configparser.ConfigParser(), + show: type_def.mvw_json_response.Show, + state_file_abs_path: str) -> None: + + timestamp_now = int(time.time()) + state_file_none_or_valid_json(state_file_abs_path) + os.makedirs(os.path.dirname(state_file_abs_path), exist_ok=True) + + state_body = show.dict(include={"topic", "title"}) + state_body["dl_complete_timestamp_epoch"] = timestamp_now + state_body["dl_complete_timestamp_human"] = \ + d.datetime.utcfromtimestamp(timestamp_now).strftime("%Y-%m-%d %H%M%S UTC") + state_entry = {timestamp_now: state_body} + json_state = None + + log.debug(f"Writing log entry to '{state_file_abs_path}' ...") + with open(state_file_abs_path, "r+") as state_file: + try: + json_state = json.load(state_file) + except json.JSONDecodeError: + if json_state is None: + state_file.truncate() + json_state = [] + + with open(state_file_abs_path, "w") as state_file: + json_state.append(state_entry) + max_log_entries = config_obj.getint(section_name, "state_file_retention") + if len(json_state) > max_log_entries: + json_state = truncate_log(json_state, max_log_entries) + json.dump(json_state, state_file, indent=4, sort_keys=True) def copy_url( @@ -431,7 +504,8 @@ def copy_url( config_obj: configparser.ConfigParser(), show: type_def.mvw_json_response.Show, video_metadata: dict, - total_content_length: int) -> None: + total_content_length: int, + state_file_abs_path: str) -> None: """Copy data from a url to a local file.""" global download_start_time @@ -447,6 +521,9 @@ def copy_url( publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime('%Y%m%d') os.makedirs(os.path.dirname(dest_path), exist_ok=True) + # TODO quit + log_successful_download(section_name, config_obj, show, state_file_abs_path) + quit() with open(dest_path, "wb") as dest_file: log.info(f"""Downloading "{show_name}" ...""") log.info(f"Download location resolved to {dest_path}") @@ -468,7 +545,7 @@ def copy_url( log.info(f"""Download of "{show_name}" interrupted""") return log.info(f"""Download of "{show_name}" done""") - log_successful_download(show) + # log_successful_download(show) def get_max_quality_url( @@ -500,6 +577,8 @@ def download_media( global download_last_update_time dl_threads = config_obj.getint(section_name, "dl_threads") + state_file_abs_path = get_state_file_abs_path(section_name, config_obj) + state_file_none_or_valid_json(state_file_abs_path) video_metadata = {} for result in json_obj.result.results.copy(): @@ -512,6 +591,7 @@ def download_media( video_metadata["total_content_length"] = total_content_length log.info(f"""Download location is 
     log.info(f"""Download location is {config_obj.get(section_name, "dl_dir")}""")
     log.info(f"Limiting parallel downloads to {dl_threads} ...")
+    # TODO prior to download check state file
     with ThreadPoolExecutor(max_workers=dl_threads) as pool:
         download_last_update_time = time.time()
         download_start_time = download_last_update_time
@@ -524,7 +604,8 @@ def download_media(
                 config_obj,
                 result,
                 video_metadata[result.id],
-                video_metadata["total_content_length"])
+                video_metadata["total_content_length"],
+                state_file_abs_path)
 
 
 if __name__ == '__main__':
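
Note, outside the patch itself: the new [maus] keys combine into season/episode-style filenames. A minimal sketch of how the pattern expands, assuming a show published 2022-03-19 07:50:51 UTC and an mp4 extension (all values illustrative):

    import datetime as d

    # In config.ini "%%" escapes "%", so the effective strftime pattern
    # behind publish_date_srtftime_pattern is "S%YE%Y%m%d01".
    pattern = "S%YE%Y%m%d01"
    publish_date = d.datetime.utcfromtimestamp(1647676251).strftime(pattern)
    print(publish_date)           # S2022E2022031901
    # dl_filename_pattern "&(publish_date)s.&(ext)s" then yields:
    print(f"{publish_date}.mp4")  # S2022E2022031901.mp4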
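In the same spirit, a sketch of the state file that log_successful_download() maintains: a JSON list with one object per finished download, keyed by the completion epoch (json.dump turns the integer key into a string) and truncated to state_file_retention entries. The topic and title values below are made up:

    [
        {
            "1647676251": {
                "dl_complete_timestamp_epoch": 1647676251,
                "dl_complete_timestamp_human": "2022-03-19 075051 UTC",
                "title": "Die Sendung mit der Maus vom 19.03.2022",
                "topic": "Die Sendung mit der Maus"
            }
        }
    ]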