From 32932fe803a5156d2c37cbfce369d49e6f82db7f Mon Sep 17 00:00:00 2001 From: hygienic-books Date: Wed, 16 Mar 2022 15:49:29 +0100 Subject: [PATCH] Get unique list of files we want to download --- config.ini | 3 +- maus-query.json | 2 +- mvw-dl.py | 111 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 91 insertions(+), 25 deletions(-) diff --git a/config.ini b/config.ini index 1e1046f..c0685af 100644 --- a/config.ini +++ b/config.ini @@ -10,9 +10,10 @@ mvw_endpoint = http://localhost:8000/api/query title_dedup_winner = first [maus] -min_duration = 1800 +min_duration = 1200 max_duration = 2700 query = @maus-query.json +title_not_regex = audiodeskription|gebärdensprache # query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50} # state_file_name = maus # tmp_base_dir = %(tmp_base_dir)s/maus diff --git a/maus-query.json b/maus-query.json index 2a45639..731ee89 100644 --- a/maus-query.json +++ b/maus-query.json @@ -17,5 +17,5 @@ "sortOrder": "desc", "future": false, "offset": 0, - "size": 8 + "size": 50 } diff --git a/mvw-dl.py b/mvw-dl.py index 9668455..9762924 100644 --- a/mvw-dl.py +++ b/mvw-dl.py @@ -2,6 +2,7 @@ import configparser import json import logging import os +import re import sys import requests import inflect @@ -9,6 +10,8 @@ from rich.logging import RichHandler from rich.traceback import install import typing as t from rich.console import Console + +import type_def.mvw_json_response from type_def.mvw_json_request import MVWJSONRequest from type_def.mvw_json_response import MVWJSONResponse @@ -185,7 +188,7 @@ def get_json_response( config_obj: configparser.ConfigParser(), payload: MVWJSONRequest) -> MVWJSONResponse: log.debug(f"Downloading JSON list of Mediathek files that match search criteria") - serialized_payload = json.dumps(payload) + serialized_payload = payload.json() url = 
config_obj.get(section_name, "mvw_endpoint") req_header = {"Content-Type": "text/plain"} s = requests.Session() @@ -197,31 +200,87 @@ def get_json_response( f"""{newline.join(f"Header '{header}': '{value}'" for header, value in list(req.headers.items()))}\n""" f"Payload: {payload}") with s.send(prepped) as s: - # return json.loads(s.content) got_json_response = MVWJSONResponse(**json.loads(s.content)) return got_json_response +def remove_result( + json_obj: MVWJSONResponse, + result_obj: type_def.mvw_json_response.Show) -> MVWJSONResponse: + json_obj.result.results.remove(result_obj) + json_obj.result.queryInfo.resultCount -= 1 + return json_obj + + +def log_result_count(result_count: int, pre_filter: bool = True) -> None: + if pre_filter: + log.debug(f"""Search result contains {result_count} {p.plural("show", result_count)} going in""") + else: + log.debug(f"""Search result now contains {result_count} {p.plural("show", result_count)}""") + + def filter_json_by_duration( section_name: str, config_obj: configparser.ConfigParser(), json_obj: MVWJSONResponse) -> MVWJSONResponse: - min_length = config_obj.getint(section_name, "min_duration") - if min_length >= 0: - log.debug(f"""Filtering JSON for minimum length of {min_length} {p.plural("second", min_length)}""") - console.log(json_obj) - #for result in json_obj["result"]["results"]: - # console.log(result) - # console.log(f"0000000000000000000000") - # if not result["duration"] >= min_length: - # pass - # json_str. 
- # console.log(f"{result}\n......................") - # console.log(json_obj) - # console.log(f"ssssssssss") + min_duration = config_obj.getint(section_name, "min_duration") + max_duration = config_obj.getint(section_name, "max_duration") + log_result_count(json_obj.result.queryInfo.resultCount) + if min_duration >= 0: + log.debug(f"Filtering '[{section_name}]' JSON for minimum length of {min_duration} " + f"""{p.plural("second", min_duration)} ...""") + for result in json_obj.result.results.copy(): + if not result.duration >= min_duration: + remove_result(json_obj, result) + if max_duration >= 0: + log.debug(f"Filtering '[{section_name}]' JSON for maximum length of {max_duration} " + f"""{p.plural("second", max_duration)} ...""") + for result in json_obj.result.results.copy(): + if not result.duration <= max_duration: + remove_result(json_obj, result) + log_result_count(json_obj.result.queryInfo.resultCount, False) + return json_obj - # json_matches_min_length = - pass + +def filter_json_by_title_regex( + section_name: str, + config_obj: configparser.ConfigParser(), + json_obj: MVWJSONResponse) -> MVWJSONResponse: + title_not_regex = re.compile(config_obj.get(section_name, "title_not_regex"), re.IGNORECASE) + log_result_count(json_obj.result.queryInfo.resultCount) + log.debug(f"Filtering '[{section_name}]' JSON by title regular expression") + for result in json_obj.result.results.copy(): + if title_not_regex.search(result.title): + remove_result(json_obj, result) + log_result_count(json_obj.result.queryInfo.resultCount, False) + return json_obj + + +def dedup_json_titles( + section_name: str, + config_obj: configparser.ConfigParser(), + json_obj: MVWJSONResponse) -> MVWJSONResponse: + title_dedup_winner = config_obj.get(section_name, "title_dedup_winner") + titles_list = {} + log_result_count(json_obj.result.queryInfo.resultCount) + for result in json_obj.result.results.copy(): + if result.title not in titles_list: + titles_list[result.title] = {} + if result.id 
not in titles_list[result.title]:
+            titles_list[result.title][result.id] = result.timestamp
+    for result in titles_list.copy():
+        if title_dedup_winner == "first":
+            dd_winner = min(titles_list[result], key=str)
+        else:
+            dd_winner = max(titles_list[result], key=str)
+        titles_list[result] = dd_winner
+    for result in json_obj.result.results.copy():
+        if result.title in titles_list:
+            if result.id != titles_list[result.title]:
+                log.debug(f"""Deduplicating '[{section_name}]' result "{result.title}" ...""")
+                remove_result(json_obj, result)
+    log_result_count(json_obj.result.queryInfo.resultCount, False)
+    return json_obj
 
 
 if __name__ == '__main__':
@@ -238,10 +297,16 @@ if __name__ == '__main__':
         log.debug(f"Processing section '[{section}]' ...")
         query_payload = get_query_payload(section, config)
         json_response = get_json_response(section, config, query_payload)
-        log.debug(json_response)
-        quit()
-        log.debug(CONST.CFG_KNOWN_SECTION[0])
-        if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
-            json_matches_duration = filter_json_by_duration(section, config, json_response)
-        # console.log(json_response)
+        log.debug(f"Filtering results by duration where applicable ...")
+        if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
+            json_response = filter_json_by_duration(section, config, json_response)
+
+        log.debug(f"Filtering results by title regular expression where applicable ...")
+        if config.has_option(section, "title_not_regex"):
+            json_response = filter_json_by_title_regex(section, config, json_response)
+
+        log.debug(f"Deduplicating results by title where needed ...")
+        if config.has_option(section, "title_dedup_winner"):
+            json_response = dedup_json_titles(section, config, json_response)
+        # console.print_json(json_response.json())