From 601583afc357ec6027221ea9ac712d097ac753af Mon Sep 17 00:00:00 2001
From: hygienic-books
Date: Sat, 19 Mar 2022 08:50:51 +0100
Subject: [PATCH] Maintain a state file

---
 config.ini |  4 ++-
 mvw-dl.py  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/config.ini b/config.ini
index 2571560..263bb3c 100644
--- a/config.ini
+++ b/config.ini
@@ -12,7 +12,7 @@ dl_progress_update_interval = 10
 dl_threads = 2
 dl_filename_pattern = &(channel)s - &(publish_date)s - &(topic)s - &(title)s.&(ext)s
 publish_date_srtftime_pattern = %%Y%%m%%d
-dl_filename_replace_spaces = 
+dl_filename_replace_spaces_with = 
 dl_filename_all_lowercase = no
 
 [maus]
@@ -20,6 +20,8 @@ min_duration = 1200
 max_duration = 2700
 query = @maus-query.json
 title_not_regex = audiodeskription|gebärdensprache
+dl_filename_pattern = &(publish_date)s.&(ext)s
+publish_date_srtftime_pattern = S%%YE%%Y%%m%%d01
 # query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
 # state_file_name = maus
 # tmp_base_dir = %(tmp_base_dir)s/maus
diff --git a/mvw-dl.py b/mvw-dl.py
index 1647f21..46825b0 100644
--- a/mvw-dl.py
+++ b/mvw-dl.py
@@ -72,6 +72,7 @@ JSONType = t.Union[str, int, float, bool, None, t.Dict[str, t.Any], t.List[t.Any]]
 # 1: Config file invalid, it has no sections
 # 2: Config file invalid, sections must define at least CONST.CFG_MANDATORY
 # 3: No search results to download
+# 4: State file exists and is non-empty but does not contain usable JSON
 
 
 class CONST(object):
@@ -94,7 +95,7 @@ class CONST(object):
         {"key": "dl_threads", "value": "2"},
         {"key": "dl_filename_pattern", "value": "&(channel)s - &(publish_date)s - &(topic)s - &(title)s"},
         {"key": "publish_date_srtftime_pattern", "value": "%%Y%%m%%d"},
-        {"key": "dl_filename_replace_spaces", "value": "_"},
+        {"key": "dl_filename_replace_spaces_with", "value": "_"},
         {"key": "dl_filename_all_lowercase", "value": "yes"}
     ]
     CFG_KNOWN_SECTION = [
@@ -394,7 +395,7 @@ def filename_replace_spaces_with_underscores(
         section_name: str,
         config_obj: configparser.ConfigParser(),
         filename: str) -> str:
-    space_replace_string = config_obj.get(section_name, "dl_filename_replace_spaces")
+    space_replace_string = config_obj.get(section_name, "dl_filename_replace_spaces_with")
     log.debug(f"Replacing space characters with '{space_replace_string}' ...")
     underscored_filename = re.sub(
         r"\s",
@@ -411,19 +412,91 @@ def get_filename(
         max_quality_url: str) -> str:
     filename_replaced_patterns = filename_replace_pattern(section_name, config_obj, show, max_quality_url)
     filename_safe = get_safe_filename(filename_replaced_patterns)
-    if config.get(section_name, "dl_filename_replace_spaces"):
+    if config.get(section_name, "dl_filename_replace_spaces_with"):
         filename_safe = filename_replace_spaces_with_underscores(section_name, config_obj, filename_safe)
     if config.getboolean(section_name, "dl_filename_all_lowercase"):
         log.debug(f"Lowercasing all filename characters ...")
         filename_safe = filename_safe.lower()
     log.debug(f"New filename: '{filename_safe}'")
     log.debug(filename_safe)
-    quit()
+    return filename_safe
+
+
+def get_state_file_abs_path(
+        section_name: str,
+        config_obj: configparser.ConfigParser()) -> str:
+
+    state_dir = config_obj.get(section_name, "state_files_dir")
+    state_file = \
+        config_obj.get(section_name, "state_file_name_prefix") + \
+        section_name + \
"state_file_name_suffix") + state_file_abs_path = os.path.join(state_dir, state_file) + return state_file_abs_path + + +def state_file_none_or_valid_json( + state_file_abs_path: str) -> bool: + + if os.path.exists(state_file_abs_path): + if os.path.getsize(state_file_abs_path) > 0: + with open(state_file_abs_path, "r") as state_file: + try: + json.loads(state_file.read()) + return True + except json.JSONDecodeError: + log.warning(f"State file '{state_file_abs_path}' does not contain valid JSON. We're not going to " + f"be able to log anything into it. Exiting 4 ...") + sys.exit(4) + else: + return True + else: + return True + + +def truncate_log( + json_data: json.loads, + max_log_entries: int) -> json.loads: + + for i in range(len(json_data)): + del json_data[i] + if len(json_data) <= max_log_entries: + break + return json_data def log_successful_download( - show: type_def.mvw_json_response.Show) -> None: - pass + section_name: str, + config_obj: configparser.ConfigParser(), + show: type_def.mvw_json_response.Show, + state_file_abs_path: str) -> None: + + timestamp_now = int(time.time()) + state_file_none_or_valid_json(state_file_abs_path) + os.makedirs(os.path.dirname(state_file_abs_path), exist_ok=True) + + state_body = show.dict(include={"topic", "title"}) + state_body["dl_complete_timestamp_epoch"] = timestamp_now + state_body["dl_complete_timestamp_human"] = \ + d.datetime.utcfromtimestamp(timestamp_now).strftime("%Y-%m-%d %H%M%S UTC") + state_entry = {timestamp_now: state_body} + json_state = None + + log.debug(f"Writing log entry to '{state_file_abs_path}' ...") + with open(state_file_abs_path, "r+") as state_file: + try: + json_state = json.load(state_file) + except json.JSONDecodeError: + if json_state is None: + state_file.truncate() + json_state = [] + + with open(state_file_abs_path, "w") as state_file: + json_state.append(state_entry) + max_log_entries = config_obj.getint(section_name, "state_file_retention") + if len(json_state) > max_log_entries: + json_state = truncate_log(json_state, max_log_entries) + json.dump(json_state, state_file, indent=4, sort_keys=True) def copy_url( @@ -431,7 +504,8 @@ def copy_url( config_obj: configparser.ConfigParser(), show: type_def.mvw_json_response.Show, video_metadata: dict, - total_content_length: int) -> None: + total_content_length: int, + state_file_abs_path: str) -> None: """Copy data from a url to a local file.""" global download_start_time @@ -447,6 +521,9 @@ def copy_url( publish_date = d.datetime.utcfromtimestamp(show.timestamp).strftime('%Y%m%d') os.makedirs(os.path.dirname(dest_path), exist_ok=True) + # TODO quit + log_successful_download(section_name, config_obj, show, state_file_abs_path) + quit() with open(dest_path, "wb") as dest_file: log.info(f"""Downloading "{show_name}" ...""") log.info(f"Download location resolved to {dest_path}") @@ -468,7 +545,7 @@ def copy_url( log.info(f"""Download of "{show_name}" interrupted""") return log.info(f"""Download of "{show_name}" done""") - log_successful_download(show) + # log_successful_download(show) def get_max_quality_url( @@ -500,6 +577,8 @@ def download_media( global download_last_update_time dl_threads = config_obj.getint(section_name, "dl_threads") + state_file_abs_path = get_state_file_abs_path(section_name, config_obj) + state_file_none_or_valid_json(state_file_abs_path) video_metadata = {} for result in json_obj.result.results.copy(): @@ -512,6 +591,7 @@ def download_media( video_metadata["total_content_length"] = total_content_length log.info(f"""Download location is 
     log.info(f"""Download location is {config_obj.get(section_name, "dl_dir")}""")
     log.info(f"Limiting parallel downloads to {dl_threads} ...")
+    # TODO prior to download check state file
     with ThreadPoolExecutor(max_workers=dl_threads) as pool:
         download_last_update_time = time.time()
         download_start_time = download_last_update_time
@@ -524,7 +604,8 @@ def download_media(
                 config_obj,
                 result,
                 video_metadata[result.id],
-                video_metadata["total_content_length"])
+                video_metadata["total_content_length"],
+                state_file_abs_path)
 
 
 if __name__ == '__main__':
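
Note, outside the patch itself: the new [maus] keys combine into season/episode-style filenames. A minimal sketch of how the pattern expands, assuming a show published 2022-03-19 07:50:51 UTC and an mp4 extension (all values illustrative):

    import datetime as d

    # In config.ini "%%" escapes "%", so the effective strftime pattern
    # behind publish_date_srtftime_pattern is "S%YE%Y%m%d01".
    pattern = "S%YE%Y%m%d01"
    publish_date = d.datetime.utcfromtimestamp(1647676251).strftime(pattern)
    print(publish_date)           # S2022E2022031901
    # dl_filename_pattern "&(publish_date)s.&(ext)s" then yields:
    print(f"{publish_date}.mp4")  # S2022E2022031901.mp4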
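In the same spirit, a sketch of the state file that log_successful_download() maintains: a JSON list with one object per finished download, keyed by the completion epoch (json.dump turns the integer key into a string) and truncated to state_file_retention entries. The topic and title values below are made up:

    [
        {
            "1647676251": {
                "dl_complete_timestamp_epoch": 1647676251,
                "dl_complete_timestamp_human": "2022-03-19 075051 UTC",
                "title": "Die Sendung mit der Maus vom 19.03.2022",
                "topic": "Die Sendung mit der Maus"
            }
        }
    ]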