Get unique list of files we want to download

hygienic-books 2022-03-16 15:49:29 +01:00
parent c5830aeadc
commit 32932fe803
3 changed files with 91 additions and 25 deletions

@@ -10,9 +10,10 @@ mvw_endpoint = http://localhost:8000/api/query
 title_dedup_winner = first
 [maus]
-min_duration = 1800
+min_duration = 1200
 max_duration = 2700
 query = @maus-query.json
+title_not_regex = audiodeskription|gebärdensprache
 # query = {"queries":[{"fields":["topic"],"query":"die sendung mit der maus"},{"fields":["channel"],"query":"ARD"}],"sortBy":"timestamp","sortOrder":"desc","future":false,"offset":0,"size":50}
 # state_file_name = maus
 # tmp_base_dir = %(tmp_base_dir)s/maus

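For context, a minimal sketch of how the [maus] options above could be consumed with configparser and re; the inlined config text and the toy show list are illustrative assumptions, not code from this commit:

import configparser
import re

# The [maus] options from the hunk above, inlined so the sketch is self-contained.
config = configparser.ConfigParser()
config.read_string("""
[maus]
min_duration = 1200
max_duration = 2700
title_not_regex = audiodeskription|gebärdensprache
""")

section = "maus"
min_duration = config.getint(section, "min_duration")
max_duration = config.getint(section, "max_duration")
title_not_regex = re.compile(config.get(section, "title_not_regex"), re.IGNORECASE)

# Hypothetical search results; real ones come from the MediathekViewWeb API.
shows = [
    {"title": "Die Sendung mit der Maus", "duration": 1500},
    {"title": "Die Sendung mit der Maus (Gebärdensprache)", "duration": 1500},
    {"title": "MausBlick", "duration": 300},
]

kept = [
    show for show in shows
    if min_duration <= show["duration"] <= max_duration
    and not title_not_regex.search(show["title"])
]
print([show["title"] for show in kept])  # ['Die Sendung mit der Maus']
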
@@ -17,5 +17,5 @@
   "sortOrder": "desc",
   "future": false,
   "offset": 0,
-  "size": 8
+  "size": 50
 }

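For reference, a rough and deliberately simplified sketch of how a query file like this reaches the mvw_endpoint configured above; it mirrors the requests usage in get_json_response further down but skips the Session/prepared-request handling. The file name maus-query.json and the localhost URL come from the config hunk above; everything else is an assumption:

import json
import requests

# Load the query referenced as @maus-query.json in the config.
with open("maus-query.json", encoding="utf-8") as query_file:
    payload = json.load(query_file)

# The body is posted with a text/plain content type, matching the req_header
# used in get_json_response() below.
response = requests.post(
    "http://localhost:8000/api/query",
    data=json.dumps(payload),
    headers={"Content-Type": "text/plain"},
    timeout=30,
)
response.raise_for_status()
print(response.json()["result"]["queryInfo"]["resultCount"])
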
mvw-dl.py

@@ -2,6 +2,7 @@ import configparser
 import json
 import logging
 import os
+import re
 import sys
 import requests
 import inflect
@@ -9,6 +10,8 @@ from rich.logging import RichHandler
 from rich.traceback import install
 import typing as t
 from rich.console import Console
+import type_def.mvw_json_response
 from type_def.mvw_json_request import MVWJSONRequest
 from type_def.mvw_json_response import MVWJSONResponse
@@ -185,7 +188,7 @@ def get_json_response(
         config_obj: configparser.ConfigParser(),
         payload: MVWJSONRequest) -> MVWJSONResponse:
     log.debug(f"Downloading JSON list of Mediathek files that match search criteria")
-    serialized_payload = json.dumps(payload)
+    serialized_payload = payload.json()
     url = config_obj.get(section_name, "mvw_endpoint")
     req_header = {"Content-Type": "text/plain"}
     s = requests.Session()
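
The switch from json.dumps(payload) to payload.json() suggests the request and response types in type_def are pydantic-style models. Those modules are not part of this diff; a hypothetical reconstruction, inferred only from the attributes the code below actually uses (result.queryInfo.resultCount, result.results[].id/title/duration/timestamp), might look like this:

# Hypothetical sketch of type_def/mvw_json_request.py and type_def/mvw_json_response.py;
# the real models in this repository may differ.
from typing import List

from pydantic import BaseModel


class MVWJSONRequest(BaseModel):
    queries: List[dict]
    sortBy: str
    sortOrder: str
    future: bool
    offset: int
    size: int


class Show(BaseModel):
    id: str
    title: str
    duration: int
    timestamp: int


class QueryInfo(BaseModel):
    resultCount: int


class QueryResult(BaseModel):
    queryInfo: QueryInfo
    results: List[Show]


class MVWJSONResponse(BaseModel):
    result: QueryResult

With models along these lines, payload.json() serializes the request body and MVWJSONResponse(**json.loads(s.content)) validates the response in one step, which is what the functions below rely on.
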
@@ -197,31 +200,87 @@ def get_json_response(
               f"""{newline.join(f"Header '{header}': '{value}'" for header, value in list(req.headers.items()))}\n"""
               f"Payload: {payload}")
     with s.send(prepped) as s:
         # return json.loads(s.content)
         got_json_response = MVWJSONResponse(**json.loads(s.content))
         return got_json_response
+
+
+def remove_result(
+        json_obj: MVWJSONResponse,
+        result_obj: type_def.mvw_json_response.Show) -> MVWJSONResponse:
+    json_obj.result.results.remove(result_obj)
+    json_obj.result.queryInfo.resultCount -= 1
+    return json_obj
+
+
+def log_result_count(result_count: int, pre_filter: bool = True) -> None:
+    if pre_filter:
+        log.debug(f"""Search result contains {result_count} {p.plural("show", result_count)} going in""")
+    else:
+        log.debug(f"""Search result now contains {result_count} {p.plural("show", result_count)}""")
+
+
 def filter_json_by_duration(
         section_name: str,
         config_obj: configparser.ConfigParser(),
         json_obj: MVWJSONResponse) -> MVWJSONResponse:
-    min_length = config_obj.getint(section_name, "min_duration")
-    if min_length >= 0:
-        log.debug(f"""Filtering JSON for minimum length of {min_length} {p.plural("second", min_length)}""")
-        console.log(json_obj)
-        #for result in json_obj["result"]["results"]:
-        # console.log(result)
-        # console.log(f"0000000000000000000000")
-        # if not result["duration"] >= min_length:
-        # pass
-        # json_str.
-        # console.log(f"{result}\n......................")
-        # console.log(json_obj)
-        # console.log(f"ssssssssss")
+    min_duration = config_obj.getint(section_name, "min_duration")
+    max_duration = config_obj.getint(section_name, "max_duration")
+    log_result_count(json_obj.result.queryInfo.resultCount)
+    if min_duration >= 0:
+        log.debug(f"Filtering '[{section_name}]' JSON for minimum length of {min_duration} "
+                  f"""{p.plural("second", min_duration)} ...""")
+        for result in json_obj.result.results.copy():
+            if not result.duration >= min_duration:
+                remove_result(json_obj, result)
+    if max_duration >= 0:
+        log.debug(f"Filtering '[{section_name}]' JSON for maximum length of {max_duration} "
+                  f"""{p.plural("second", max_duration)} ...""")
+        for result in json_obj.result.results.copy():
+            if not result.duration <= max_duration:
+                remove_result(json_obj, result)
+    log_result_count(json_obj.result.queryInfo.resultCount, False)
+    return json_obj
-    # json_matches_min_length =
-    pass
+
+
+def filter_json_by_title_regex(
+        section_name: str,
+        config_obj: configparser.ConfigParser(),
+        json_obj: MVWJSONResponse) -> MVWJSONResponse:
+    title_not_regex = re.compile(config_obj.get(section_name, "title_not_regex"), re.IGNORECASE)
+    log_result_count(json_obj.result.queryInfo.resultCount)
+    log.debug(f"Filtering '[{section_name}]' JSON by title regular expression")
+    for result in json_obj.result.results.copy():
+        if title_not_regex.search(result.title):
+            remove_result(json_obj, result)
+    log_result_count(json_obj.result.queryInfo.resultCount, False)
+    return json_obj
+
+
+def dedup_json_titles(
+        section_name: str,
+        config_obj: configparser.ConfigParser(),
+        json_obj: MVWJSONResponse) -> MVWJSONResponse:
+    title_dedup_winner = config_obj.get(section_name, "title_dedup_winner")
+    titles_list = {}
+    log_result_count(json_obj.result.queryInfo.resultCount)
+    for result in json_obj.result.results.copy():
+        if result.title not in titles_list:
+            titles_list[result.title] = {}
+        if result.id not in titles_list[result.title]:
+            titles_list[result.title][result.id] = result.timestamp
+    for result in titles_list.copy():
+        if title_dedup_winner == "first":
+            dd_winner = min(titles_list[result], key=str)
+        else:
+            dd_winner = max(titles_list[result], key=str)
+        titles_list[result] = dd_winner
+    for result in json_obj.result.results.copy():
+        if result.title in titles_list:
+            if result.id != titles_list[result.title]:
+                log.debug(f"""Deduplicating '[{section_name}]' result "{result.title}" ...""")
+                remove_result(json_obj, result)
+    log_result_count(json_obj.result.queryInfo.resultCount, False)
+    return json_obj


 if __name__ == '__main__':
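
To make the dedup_json_titles logic added above concrete: for every title that occurs more than once, the candidate ids are collected and the id that sorts first (title_dedup_winner = first) or last as a string wins; every other result with that title is removed. A standalone toy version on plain dicts, with invented ids and timestamps:

# Toy input: two results share a title; ids and timestamps are made up.
results = [
    {"id": "b-222", "title": "Die Sendung mit der Maus", "timestamp": 1647200000},
    {"id": "a-111", "title": "Die Sendung mit der Maus", "timestamp": 1646600000},
    {"id": "c-333", "title": "MausSpezial", "timestamp": 1647100000},
]
title_dedup_winner = "first"  # as set above the [maus] section in the config

# Collect candidate ids per title, then pick one winner per title,
# mirroring the two passes in dedup_json_titles().
candidates = {}
for result in results:
    candidates.setdefault(result["title"], {})[result["id"]] = result["timestamp"]

winners = {
    title: min(ids, key=str) if title_dedup_winner == "first" else max(ids, key=str)
    for title, ids in candidates.items()
}

deduped = [result for result in results if result["id"] == winners[result["title"]]]
print([result["id"] for result in deduped])  # ['a-111', 'c-333']
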
@@ -238,10 +297,16 @@ if __name__ == '__main__':
         log.debug(f"Processing section '[{section}]' ...")
         query_payload = get_query_payload(section, config)
         json_response = get_json_response(section, config, query_payload)
-        log.debug(json_response)
-        quit()
-        log.debug(CONST.CFG_KNOWN_SECTION[0])
-        if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
-            json_matches_duration = filter_json_by_duration(section, config, json_response)
+        # console.log(json_response)
+        log.debug(f"Filtering results by duration where applicable ...")
+        if config.has_option(section, "min_duration") or config.has_option(section, "max_duration"):
+            json_response = filter_json_by_duration(section, config, json_response)
+        log.debug(f"Filtering results by title regular expression where applicable ...")
+        if config.has_option(section, "title_not_regex"):
+            json_response = filter_json_by_title_regex(section, config, json_response)
log.debug(f"Deduplicating results by title where needed ...")
if config.has_option(section, "title_not_regex"):
json_response = dedup_json_titles(section, config, json_response)
# console.print_json(json_response.json())
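
One detail worth noting about the config.has_option() gates in the new main-loop code: configparser also reports options that are only defined in [DEFAULT], so a filter step still runs when the option is inherited rather than set in the section itself. A small self-contained check, with an invented config snippet:

import configparser

config = configparser.ConfigParser()
config.read_string("""
[DEFAULT]
min_duration = 1200

[maus]
max_duration = 2700
""")

# has_option() also sees options inherited from [DEFAULT].
print(config.has_option("maus", "min_duration"))     # True (inherited from DEFAULT)
print(config.has_option("maus", "max_duration"))     # True (set directly)
print(config.has_option("maus", "title_not_regex"))  # False -> that filter would be skipped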